1
0
cms11/app/Services/Wikidata/WikidataExtractor.php

377 lines
10 KiB
PHP
Raw Normal View History

2024-04-19 11:21:33 +02:00
<?php
namespace App\Services\Wikidata;
use App\Models\WikidataProperty;
use Illuminate\Support\Str;
2024-04-19 11:21:33 +02:00
class WikidataExtractor
{
/** @var array Included claims grouped by inclusion category; filled by extract(). */
protected array $included = [];

/** @var array Claims whose property is listed in the exclusions; filled by extract(). */
protected array $excluded = [];

/** @var array Claims that are neither included nor excluded; filled by extract(). */
protected array $unused = [];

/** @var array The raw entity data that was handed to extract(). */
protected array $everythingElse = [];

/** @var string Declared for the processed entity's ID; not assigned anywhere in this class. */
protected string $entityId;

/** @var array Property ID => label, as loaded by getDeclaredPropertiesInEntity(). */
protected $properties = [];

/** @var array Entity labels as returned by WikidataClient::getLabelsForEntities(); looked up by entity ID. */
protected $entities = [];

/** @var array Wiki links built from the entity's sitelinks; filled by extract(). */
protected array $wikiLinks = [];

/**
 * All result properties start out as empty arrays so that the accessors
 * (and the label lookups) are safe to use before extract() has run.
 *
 * @param array $exclusions Property IDs whose claims must be set aside as "excluded"
 * @param array $inclusions Category name => list of property IDs to include
 *                          (the 'Distribution' category gets special handling in parseClaim())
 * @param array $templates  Entries keyed by ID, each optionally holding a 'title'
 *                          (replacement label) and a 'template' (sprintf format)
 */
public function __construct(
    protected array $exclusions,
    protected array $inclusions,
    protected array $templates,
) {
}
2024-04-19 11:21:33 +02:00
/**
 * Return the included claims, grouped by inclusion category.
 *
 * The data is produced by extract(); when extract() has not run yet an
 * empty array is returned instead of failing on the uninitialized property.
 *
 * @return array
 */
public function included(): array
{
    return $this->included ?? [];
}
/**
 * Return the claims whose property is listed in the exclusions.
 *
 * The data is produced by extract(); when extract() has not run yet an
 * empty array is returned instead of failing on the uninitialized property.
 *
 * @return array
 */
public function excluded(): array
{
    return $this->excluded ?? [];
}
/**
 * Return the claims that are neither included nor excluded.
 *
 * The data is produced by extract(); when extract() has not run yet an
 * empty array is returned instead of failing on the uninitialized property.
 *
 * @return array
 */
public function unused(): array
{
    return $this->unused ?? [];
}
/**
 * Return the raw entity data that was handed to extract().
 *
 * When extract() has not run yet an empty array is returned instead of
 * failing on the uninitialized property.
 *
 * @return array
 */
public function everythingElse(): array
{
    return $this->everythingElse ?? [];
}
2024-04-26 00:35:17 +02:00
/**
 * Return the wiki links built from the entity's sitelinks.
 *
 * The data is produced by extract(); when extract() has not run yet an
 * empty array is returned instead of failing on the uninitialized property.
 *
 * @return array
 */
public function wikiLinks(): array
{
    return $this->wikiLinks ?? [];
}
2024-04-19 11:21:33 +02:00
/**
 * Split the claims of an entity into three arrays: explicitly included
 * properties, explicitly excluded properties and unused properties
 * (neither included nor excluded). The wiki links found in the sitelinks
 * and the raw entity data are stored as well.
 *
 * Results are read back through included(), excluded(), unused(),
 * wikiLinks() and everythingElse().
 *
 * @param array  $entityData Decoded entity data; the 'claims' and 'sitelinks'
 *                           keys are read and treated as empty when absent
 * @param string $entityId   ID of the entity itself, so it can be left out of
 *                           the entity label lookup
 */
public function extract(array $entityData, string $entityId): void
{
    // The entity is serialized once so that every P…/Q… identifier it mentions
    // can be collected with a regular expression. An encoding failure yields
    // an empty string, i.e. no identifiers to look up.
    $json = json_encode($entityData, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE) ?: '';

    // Labels must be loaded before browse(), which relies on them through replaceValue().
    $this->properties = $this->getDeclaredPropertiesInEntity($json);
    $this->entities = $this->getDeclaredEntitiesInEntity($json, $entityId);

    $result = $this->browse($entityData['claims'] ?? []);

    $this->included = $result['included'];
    $this->excluded = $result['excluded'];
    $this->unused = $result['unused'];
    $this->wikiLinks = $this->extractWikiLinks($entityData['sitelinks'] ?? []);
    $this->everythingElse = $entityData;
}
/**
 * Look up the label of every Wikidata property mentioned in the entity.
 *
 * @param string $data JSON-encoded entity data
 *
 * @return array Property ID => label, for the IDs found in the
 *               WikidataProperty table
 */
private function getDeclaredPropertiesInEntity(string $data)
{
    $found = [];
    preg_match_all('/P\d{1,}/', $data, $found);

    // Natural ordering first, then de-duplication of the matched IDs
    $candidates = $found[0];
    natsort($candidates);
    $ids = collect(array_values($candidates))->unique()->all();

    return WikidataProperty::whereIn('property_id', $ids)
        ->get()
        ->pluck('label', 'property_id')
        ->toArray();
}
/**
 * Fetch the labels of every Wikidata entity (Q…) mentioned in the entity
 * data, the processed entity itself excepted.
 *
 * @param string $data     JSON-encoded entity data
 * @param string $entityId ID of the processed entity, left out of the lookup
 *
 * @return array Whatever WikidataClient::getLabelsForEntities() returns for
 *               the collected IDs (used as an "entity ID => label" map by
 *               replaceValue()); an empty array when no other entity is
 *               mentioned
 */
private function getDeclaredEntitiesInEntity(string $data, string $entityId)
{
    preg_match_all('/Q\d{1,}/', $data, $matches);

    $ids = array_unique($matches[0]);
    natsort($ids);

    // The entity's own ID has to be removed by VALUE. Collection::except()
    // removes by key, so it never matched anything in this list of IDs.
    $ids = array_values(array_diff($ids, [$entityId]));

    // Nothing to resolve: skip the client call altogether.
    if ($ids === []) {
        return [];
    }

    return app()->make(WikidataClient::class)->getLabelsForEntities($ids);
}
/**
 * Walk through the claims of an entity and sort each property into one of
 * three groups: excluded, included or unused.
 *
 * Keys of the excluded and unused groups are rewritten as "[code] label";
 * included claims keep their property ID as key and are then regrouped by
 * category through reorganizeIncluded().
 *
 * @param array $claims Property ID => list of claims
 *
 * @return array Array with the 'excluded', 'included' and 'unused' keys
 */
private function browse(array $claims)
{
    // Flattened once: the list does not change while iterating over the claims.
    $includedIds = collect($this->inclusions)->flatten()->values()->toArray();

    $included = [];
    $excluded = [];
    $unused = [];

    foreach ($claims as $key => $data) {
        $isExcluded = in_array($key, $this->exclusions, true);
        $isIncluded = in_array($key, $includedIds, true);

        $claimGroup = $this->parseClaims($key, $data, $isIncluded);

        // Exclusion wins when a property is listed on both sides.
        if ($isExcluded) {
            $excluded[$this->replaceValue($key, true, true)] = $claimGroup;
        } elseif ($isIncluded) {
            $included[$key] = $claimGroup;
        } else {
            $unused[$this->replaceValue($key, true, true)] = $claimGroup;
        }
    }

    return [
        'excluded' => $excluded,
        'included' => $this->reorganizeIncluded($included),
        'unused' => $unused,
    ];
}
/**
 * Parse every claim attached to one property and gather the results.
 *
 * Array results are folded into the accumulator with
 * array_merge_recursive(); any other result is appended as a list item.
 *
 * @param string $propertyId     Property the claims belong to
 * @param array  $data           List of claims
 * @param bool   $parentIncluded Whether the property is an included one
 *
 * @return array
 */
private function parseClaims(string $propertyId, array $data, bool $parentIncluded)
{
    $merged = [];

    foreach ($data as $claim) {
        $parsed = $this->parseClaim($propertyId, $claim, $parentIncluded);

        if (!is_array($parsed)) {
            $merged[] = $parsed;
            continue;
        }

        $merged = array_merge_recursive($merged, $parsed);
    }

    return $merged;
}
/**
 * Parse a single claim.
 *
 * - With qualifiers: returns [value => [qualifier key => list of values]].
 * - Without qualifiers, for a property of the 'Distribution' inclusion
 *   category: returns [value => []].
 * - Otherwise: returns the parsed main value as is.
 *
 * @param string $propertyId     Property the claim belongs to
 * @param array  $data           Claim data; 'mainsnak' is required, 'qualifiers' optional
 * @param bool   $parentIncluded Whether the property is an included one
 *
 * @return mixed
 */
private function parseClaim(string $propertyId, array $data, bool $parentIncluded)
{
    $value = $this->parseSnak($data['mainsnak'], $parentIncluded);

    // parseSnak() may hand back an array (coordinates, or a snak without a
    // value). Arrays are not valid array keys, so they are serialized before
    // being used as one.
    $valueKey = is_array($value)
        ? (string) json_encode($value, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE)
        : $value;

    if (!empty($data['qualifiers'])) {
        $itemQualifiers = [];
        foreach ($data['qualifiers'] as $qualifierProperty => $qualifiers) {
            $qualifierKey = $this->replaceValue($qualifierProperty, true, !$parentIncluded);
            foreach ($qualifiers as $qualifierData) {
                $itemQualifiers[$qualifierKey][] = $this->parseSnak($qualifierData, $parentIncluded);
            }
        }

        return [$valueKey => $itemQualifiers];
    }

    // The 'Distribution' category may be absent from the configuration.
    if (in_array($propertyId, $this->inclusions['Distribution'] ?? [], true)) {
        return [$valueKey => []];
    }

    return $value;
}
/**
 * Parse a single snak and reduce it to a displayable value.
 *
 * - 'wikibase-entityid' and 'string' values go through replaceValue().
 * - 'time', 'quantity' and 'monolingualtext' are reduced to their 'time',
 *   'amount' and 'text' field respectively.
 * - Any other type (e.g. 'globecoordinate') is returned untouched.
 * - A snak without a datavalue is returned whole.
 *
 * @param array $data           Snak data
 * @param bool  $parentIncluded Whether the owning property is an included one;
 *                              labels are only shown when it is not
 *
 * @return mixed
 */
private function parseSnak(array $data, bool $parentIncluded)
{
    // isset() rather than empty(): a legitimate value such as "0" must not be
    // mistaken for a missing one.
    if (!isset($data['datavalue']['value'], $data['datavalue']['type'])) {
        return $data;
    }

    $value = $data['datavalue']['value'];

    return match ($data['datavalue']['type']) {
        'wikibase-entityid' => $this->replaceValue($value['id'], true, !$parentIncluded),
        'string' => $this->replaceValue($value, true, !$parentIncluded),
        'time' => $value['time'],
        'quantity' => $value['amount'],
        'monolingualtext' => $value['text'],
        // Unknown types are passed through rather than aborting the whole
        // request with a debug dump.
        default => $value,
    };
}
/**
 * Turn a raw value into a more human-friendly one.
 *
 * The label is looked up, in that order, in the templates ('title'), the
 * property labels and the entity labels; when nothing matches, the value is
 * its own label. If $key points to a template entry holding a non-empty
 * 'template', the value formatted with that template is returned and the
 * two flags are ignored.
 *
 * @param string      $value     Value to replace (typically a P… or Q… ID)
 * @param bool        $showCode  Return the original code
 * @param bool        $showLabel Return the label; combined with $showCode the
 *                               result reads "[code] label"
 * @param string|null $key       Template entry to format the value with
 *
 * @return mixed
 */
private function replaceValue(string $value, bool $showCode = true, bool $showLabel = true, ?string $key = null)
{
    $code = $value;

    $label = match (true) {
        array_key_exists($value, $this->templates) => $this->templates[$value]['title'],
        array_key_exists($value, $this->properties) => Str::ucfirst($this->properties[$value]),
        array_key_exists($value, $this->entities) => $this->entities[$value],
        default => $value,
    };

    $both = $value;
    if ($code !== $label) {
        $both = sprintf('[%s] %s', $code, $label);
    }

    // A template attached to $key takes precedence over the code/label flags
    $template = empty($key) ? null : ($this->templates[$key]['template'] ?? null);
    if (!empty($template)) {
        return sprintf($template, $value);
    }

    if (!$showCode) {
        return $label;
    }

    return $showLabel ? $both : $code;
}
/**
 * Regroup the "raw" included data by category, following the order of the
 * "inclusions" configuration. Categories without any data are left out.
 *
 * @param array $includedData Property ID => parsed claims
 *
 * @return array Category => included properties
 */
private function reorganizeIncluded(array $includedData)
{
    $byCategory = [];

    foreach ($this->inclusions as $category => $properties) {
        $kept = $this->includeProperties($includedData, $properties);
        if (empty($kept)) {
            continue;
        }

        $byCategory[$category] = $kept;
    }

    return $byCategory;
}
/**
 * Pick the listed properties out of the included data and make them
 * human-friendly: property IDs become labels, and each entry goes through
 * replaceValue() once more with the property ID as template key.
 *
 * @param array $includedData Property ID => parsed claims
 * @param array $properties   Property IDs to keep, in display order
 *
 * @return array Property label => values
 */
private function includeProperties($includedData, $properties)
{
    $output = [];

    foreach ($properties as $propertyId) {
        if (array_key_exists($propertyId, $includedData)) {
            $label = $this->replaceValue($propertyId, false, true);
            $output[$label] = $this->includeValues($propertyId, $includedData[$propertyId]);

            // The loop runs on a snapshot of the entries, so they can be
            // re-keyed in place: string entries are rewritten themselves,
            // any other entry has its key rewritten instead.
            foreach ($output[$label] as $subKey => $entry) {
                [$renamedKey, $renamedEntry] = is_string($entry)
                    ? [$subKey, $this->replaceValue($entry, false, true, $propertyId)]
                    : [$this->replaceValue($subKey, false, true, $propertyId), $entry];

                unset($output[$label][$subKey]);
                $output[$label][$renamedKey] = $renamedEntry;
            }
        }
    }

    return $output;
}
/**
 * Recursively copy the given values, skipping every key listed in the
 * exclusions and replacing keys and scalar values with their labels.
 *
 * @param string $propertyId Property the values belong to (only passed along
 *                           to the recursive calls)
 * @param array  $values     Values to filter and relabel
 *
 * @return array
 */
private function includeValues(string $propertyId, array $values)
{
    $kept = [];

    foreach ($values as $key => $value) {
        if (!in_array($key, $this->exclusions)) {
            $label = $this->replaceValue($key, false, true);
            $kept[$label] = is_array($value)
                ? $this->includeValues($propertyId, $value)
                : $this->replaceValue($value, false, true);
        }
    }

    return $kept;
}
2024-04-26 00:35:17 +02:00
/**
 * Build the French and English wiki URLs out of the entity's sitelinks.
 *
 * Only site names made of exactly the "fr" or "en" language code followed
 * by a project name ("frwiki", "enwikiquote", …) are kept. A plain prefix
 * test would also accept other languages such as "frrwiki" and produce
 * links to hosts that do not exist.
 *
 * @param array $siteLinks Site name => sitelink data (the 'title' key is read)
 *
 * @return array ['Liens' => [host => list of URLs]]
 */
private function extractWikiLinks(array $siteLinks)
{
    $links = ['Liens' => []];

    foreach ($siteLinks as $name => $data) {
        if (!preg_match('/^(fr|en)(wik[a-z]+)$/', (string) $name, $parts)) {
            continue;
        }

        [, $language, $siteType] = $parts;
        $host = $this->getWikidataHost($siteType);

        // The title is kept readable (not percent-encoded); spaces become
        // underscores, as in MediaWiki page URLs, so the link stays usable.
        $title = str_replace(' ', '_', $data['title']);

        $links['Liens'][$host][] = sprintf('https://%s.%s/wiki/%s', $language, $host, $title);
    }

    return $links;
}
/**
 * Map a site type to its host name: "wiki" stands for Wikipedia, any other
 * type is used as the domain name itself.
 *
 * @param string $siteType Site name without its language prefix
 *
 * @return string Host name, without the language subdomain
 */
private function getWikidataHost($siteType)
{
    return $siteType === 'wiki'
        ? 'wikipedia.org'
        : $siteType . '.org';
}
2024-04-19 11:21:33 +02:00
}