382 lines
11 KiB
PHP
382 lines
11 KiB
PHP
<?php
|
|
|
|
namespace App\Services\Wikidata;
|
|
|
|
use App\Models\WikidataProperty;
|
|
use Illuminate\Support\Str;
|
|
|
|
/**
 * Splits the raw data of a Wikidata entity into "included", "excluded" and
 * "unused" claim groups, according to the inclusion/exclusion configuration
 * handed to the constructor, and builds the list of French/English wiki links.
 *
 * Call extract() first, then read the results through the getters.
 */
class WikidataExtractor
{
    /** Included claims, grouped by the category names of the "inclusions" configuration. */
    protected array $included = [];

    /** Claims whose property is listed in the "exclusions" configuration. */
    protected array $excluded = [];

    /** Claims whose property is neither included nor excluded. */
    protected array $unused = [];

    /** The raw entity data, exactly as passed to extract(). */
    protected array $everythingElse = [];

    /** ID of the entity processed by the last extract() call. */
    protected string $entityId = '';

    /** Map of property ID => label for the properties referenced by the entity. */
    protected $properties = [];

    /**
     * Labels of the entities referenced by the entity, as returned by
     * WikidataClient::getLabelsForEntities() (used here as an ID => label map).
     */
    protected $entities = [];

    /** Wiki links built from the entity sitelinks (see extractWikiLinks()). */
    protected array $wikiLinks = [];

    /**
     * @param array $exclusions Property IDs to classify as "excluded"
     * @param array $inclusions Category name => list of property IDs to classify as "included"
     * @param array $templates  Per-key display configuration; the 'title' and 'template' entries are read here
     */
    public function __construct(protected array $exclusions, protected array $inclusions, protected array $templates)
    {
    }

    /**
     * @return array Included claims, reorganized by inclusion category
     */
    public function included()
    {
        return $this->included;
    }

    /**
     * @return array Explicitly excluded claims
     */
    public function excluded()
    {
        return $this->excluded;
    }

    /**
     * @return array Claims that are neither included nor excluded
     */
    public function unused()
    {
        return $this->unused;
    }

    /**
     * @return array The raw entity data given to extract()
     */
    public function everythingElse()
    {
        return $this->everythingElse;
    }

    /**
     * @return array Wiki links, under the 'Liens' key, grouped by host
     */
    public function wikiLinks()
    {
        return $this->wikiLinks;
    }

    /**
     * Split the claims of the specified entity into three arrays containing
     * explicitly included properties, explicitly excluded properties and
     * unused properties (neither included nor excluded), and collect the
     * wiki links of the entity.
     *
     * @param array  $entityData Raw entity data; the 'claims' and 'sitelinks' entries are read
     * @param string $entityId   ID of the entity being processed
     */
    public function extract(array $entityData, string $entityId)
    {
        $this->entityId = $entityId;

        // json_encode() returns false on failure; fall back to an empty
        // document so that the ID scans below simply find nothing.
        $json = (string) json_encode($entityData, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);

        $this->properties = $this->getDeclaredPropertiesInEntity($json);
        $this->entities = $this->getDeclaredEntitiesInEntity($json, $entityId);

        $result = $this->browse($entityData['claims'] ?? []);

        $this->included = $result['included'];
        $this->excluded = $result['excluded'];
        $this->unused = $result['unused'];
        $this->wikiLinks = $this->extractWikiLinks($entityData['sitelinks'] ?? []);

        $this->everythingElse = $entityData;
    }

    /**
     * Return an array containing Wikidata property IDs as keys and the
     * corresponding labels (read from the database) as values.
     *
     * @param string $data JSON representation of the entity
     */
    private function getDeclaredPropertiesInEntity(string $data): array
    {
        preg_match_all('/P\d{1,}/', $data, $matches);
        natsort($matches[0]);

        $ids = collect($matches[0])->unique()->values()->all();

        return WikidataProperty::whereIn('property_id', $ids)
            ->get()
            ->pluck('label', 'property_id')
            ->toArray();
    }

    /**
     * Return the labels of every Wikidata entity referenced in the data,
     * except the entity being processed itself.
     *
     * @param string $data     JSON representation of the entity
     * @param string $entityId ID of the entity being processed
     */
    private function getDeclaredEntitiesInEntity(string $data, string $entityId)
    {
        preg_match_all('/Q\d{1,}/', $data, $matches);
        natsort($matches[0]);

        // Filter by value: Collection::except() works on keys and therefore
        // never removed the current entity from this list of IDs.
        $ids = collect($matches[0])
            ->unique()
            ->reject(fn ($id) => $id === $entityId)
            ->values()
            ->all();

        if ($ids === []) {
            // Nothing to look up, do not call the client with an empty list.
            return [];
        }

        return app()->make(WikidataClient::class)->getLabelsForEntities($ids);
    }

    /**
     * Browse the claims and dispatch each property into the "included",
     * "excluded" or "unused" group.
     *
     * @param array $claims Property ID => list of claims
     *
     * @return array{excluded: array, included: array, unused: array}
     */
    private function browse(array $claims): array
    {
        $included = [];
        $excluded = [];
        $unused = [];

        // The flattened inclusion list does not depend on the claim: build it once.
        $includedIds = collect($this->inclusions)->flatten()->values()->toArray();

        foreach ($claims as $key => $data) {
            $propertyId = (string) $key;
            $isIncluded = in_array($propertyId, $includedIds, true);
            $claimGroup = $this->parseClaims($propertyId, $data, $isIncluded);

            // Exclusion wins over inclusion when a property is listed in both.
            if (in_array($propertyId, $this->exclusions, true)) {
                $excluded[$this->replaceValue($propertyId, true, true)] = $claimGroup;
            } elseif ($isIncluded) {
                // Keep the raw ID: reorganizeIncluded() looks properties up by ID.
                $included[$key] = $claimGroup;
            } else {
                $unused[$this->replaceValue($propertyId, true, true)] = $claimGroup;
            }
        }

        return [
            'excluded' => $excluded,
            'included' => $this->reorganizeIncluded($included),
            'unused' => $unused,
        ];
    }

    /**
     * Parse all the claims of a specific property.
     *
     * Claims parsed as arrays are merged recursively into the result, the
     * other ones are appended to it.
     */
    private function parseClaims(string $propertyId, array $data, bool $parentIncluded): array
    {
        $result = [];

        foreach ($data as $claim) {
            $parsedClaim = $this->parseClaim($propertyId, $claim, $parentIncluded);

            if (is_array($parsedClaim)) {
                $result = array_merge_recursive($result, $parsedClaim);
            } else {
                $result[] = $parsedClaim;
            }
        }

        return $result;
    }

    /**
     * Parse a specific claim.
     *
     * @return mixed [value => qualifiers] when the claim has qualifiers,
     *               [value => []] for qualifier-less claims of the
     *               'Distribution' inclusion category, the bare parsed value
     *               otherwise
     */
    private function parseClaim(string $propertyId, array $data, bool $parentIncluded)
    {
        $value = $this->parseSnak($data['mainsnak'], $parentIncluded);

        if (!empty($data['qualifiers'])) {
            $itemQualifiers = [];

            foreach ($data['qualifiers'] as $qualifierProperty => $qualifiers) {
                $qualifierKey = $this->replaceValue((string) $qualifierProperty, true, !$parentIncluded);

                foreach ($qualifiers as $qualifierData) {
                    $itemQualifiers[$qualifierKey][] = $this->parseSnak($qualifierData, $parentIncluded);
                }
            }

            return [$this->toArrayKey($value) => $itemQualifiers];
        }

        // The 'Distribution' category may not be configured at all.
        if (in_array($propertyId, $this->inclusions['Distribution'] ?? [], true)) {
            return [$this->toArrayKey($value) => []];
        }

        return $value;
    }

    /**
     * Make a parsed value usable as an array key.
     *
     * parseSnak() returns arrays for snaks without a value and for globe
     * coordinates; using an array as a key throws a TypeError, so such values
     * are JSON-encoded. Any other value is returned untouched.
     */
    private function toArrayKey($value)
    {
        if (is_array($value)) {
            return (string) json_encode($value, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);
        }

        return $value;
    }

    /**
     * Parse a specific snak.
     *
     * @return mixed The snak itself when it carries no value, the parsed value otherwise
     *
     * @throws \UnexpectedValueException When the type of the value is not supported
     */
    private function parseSnak(array $data, bool $parentIncluded)
    {
        if (empty($data['datavalue']['value'])) {
            return $data;
        }

        $value = $data['datavalue']['value'];
        $valueType = $data['datavalue']['type'] ?? null;

        if (($data['datatype'] ?? null) === 'external-id') {
            // The property ID is passed as template key so that a template
            // configured for the property can format the identifier.
            return $this->replaceValue($value, true, !$parentIncluded, $data['property'] ?? null);
        }

        return match ($valueType) {
            'wikibase-entityid' => $this->replaceValue($value['id'], true, !$parentIncluded),
            'string' => $this->replaceValue($value, true, !$parentIncluded),
            'time' => $value['time'],
            'quantity' => $value['amount'],
            'monolingualtext' => $value['text'],
            // Leave it as is
            'globecoordinate' => $value,
            default => throw new \UnexpectedValueException(sprintf(
                'Unsupported Wikidata value type %s for property %s',
                (string) json_encode($valueType),
                (string) json_encode($data['property'] ?? null)
            )),
        };
    }

    /**
     * Replace a value with a more human-friendly version. Basically replaces
     * Wikidata entities and properties with labels stored in database, if it
     * applies.
     *
     * @param string      $value     Raw value (possibly a property or entity ID)
     * @param bool        $showCode  Whether the raw value must be part of the result
     * @param bool        $showLabel Whether the label must be part of the result
     * @param string|null $key       Key of a configured sprintf() template to format the value with
     *
     * @return mixed "[code] label" when both are requested and differ, the code
     *               alone when only the code is requested, the label otherwise
     */
    private function replaceValue(string $value, bool $showCode = true, bool $showLabel = true, ?string $key = null)
    {
        // A template configured for the key takes precedence over everything else.
        if (!empty($key) && !empty($value)) {
            $template = $this->templates[$key]['template'] ?? null;

            if (!empty($template)) {
                return sprintf($template, $value);
            }
        }

        $label = $value;

        if (array_key_exists($value, $this->templates)) {
            $label = $this->templates[$value]['title'];
        } elseif (array_key_exists($value, $this->properties)) {
            $label = Str::ucfirst($this->properties[$value]);
        } elseif (array_key_exists($value, $this->entities)) {
            $label = $this->entities[$value];
        }

        if (!$showCode) {
            return $label;
        }

        if (!$showLabel) {
            return $value;
        }

        return $value !== $label ? sprintf('[%s] %s', $value, $label) : $value;
    }

    /**
     * Take the "raw" included data and reorganize it according to the
     * "inclusions" Wikidata configuration. Categories without any data are
     * left out.
     */
    private function reorganizeIncluded(array $includedData): array
    {
        $reorganized = [];

        foreach ($this->inclusions as $category => $properties) {
            $result = $this->includeProperties($includedData, $properties);

            if (!empty($result)) {
                $reorganized[$category] = $result;
            }
        }

        return $reorganized;
    }

    /**
     * Include specific properties: keep the data of the listed properties
     * only, in the listed order, with human-friendly keys and values.
     */
    private function includeProperties($includedData, $properties): array
    {
        $result = [];

        foreach ($properties as $propertyId) {
            if (!array_key_exists($propertyId, $includedData)) {
                continue;
            }

            $newKey = $this->replaceValue($propertyId, false, true);

            $result[$newKey] = $this->includeValues($propertyId, $includedData[$propertyId]);

            // foreach iterates over a copy, so the entries can safely be
            // removed and added again under their final key.
            foreach ($result[$newKey] as $key => $newValues) {
                $newSubKey = $key;

                if (is_string($newValues)) {
                    $newValues = $this->replaceValue($newValues, false, true);
                } else {
                    $newSubKey = $this->replaceValue((string) $key, false, true);
                }

                unset($result[$newKey][$key]);

                $result[$newKey][$newSubKey] = $newValues;
            }
        }

        return $result;
    }

    /**
     * Include specific values: recursively replace keys and values with their
     * human-friendly version, skipping the keys listed in the exclusions.
     */
    private function includeValues(string $propertyId, array $values): array
    {
        $newValues = [];

        foreach ($values as $key => $value) {
            // Keys may be integers (list entries): compare them as strings.
            if (in_array((string) $key, $this->exclusions, true)) {
                continue;
            }

            $newKey = $this->replaceValue((string) $key, false, true);

            if (is_array($value)) {
                $value = $this->includeValues($propertyId, $value);
            } elseif ($value !== null) {
                // null leaves (e.g. the altitude of a globe coordinate) are
                // kept untouched: replaceValue() only accepts strings.
                $value = $this->replaceValue((string) $value, false, true);
            }

            $newValues[$newKey] = $value;
        }

        return $newValues;
    }

    /**
     * Build the links to the French and English wiki pages of the entity.
     *
     * @param array $siteLinks Site ID (e.g. "frwiki", "enwikiquote") => data holding the page 'title'
     *
     * @return array Links, under the 'Liens' key, grouped by host
     */
    private function extractWikiLinks(array $siteLinks): array
    {
        $links = ['Liens' => []];

        foreach ($siteLinks as $name => $data) {
            // The language must be directly followed by the site type: a
            // plain "starts with fr/en" test also matches other languages
            // such as "frrwiki" or "frpwiki".
            if (preg_match('/^(fr|en)(wik[a-z]+)$/', (string) $name, $parts) !== 1) {
                continue;
            }

            [, $language, $siteType] = $parts;
            $host = $this->getWikidataHost($siteType);

            // Keep the title readable, but use the wiki convention for spaces
            // and escape the characters that would alter the URL structure.
            $title = strtr((string) $data['title'], [
                ' ' => '_',
                '%' => '%25',
                '?' => '%3F',
                '#' => '%23',
            ]);

            $links['Liens'][$host][] = sprintf('https://%s.%s/wiki/%s', $language, $host, $title);
        }

        return $links;
    }

    /**
     * Return the host matching a site type: "wiki" stands for Wikipedia, the
     * other site types are used as domain name as is.
     */
    private function getWikidataHost($siteType): string
    {
        if ($siteType === 'wiki') {
            return 'wikipedia.org';
        }

        return $siteType . '.org';
    }
}
|