1
0
cms11/app/Services/Wikidata/WikidataExtractor.php

309 lines
8.3 KiB
PHP

<?php
namespace App\Services\Wikidata;
use App\Models\WikidataProperty;
/**
 * Splits the claims of a Wikidata entity into "included", "excluded" and
 * "unused" buckets according to the configured property lists, replacing
 * property/entity IDs with human-friendly labels where they are known.
 */
class WikidataExtractor
{
    /**
     * Included claims regrouped by inclusion category, as produced by
     * reorganizeIncluded(). Empty until extract() has been called.
     */
    protected array $included = [];

    /**
     * Claims of explicitly excluded properties, keyed by the "[code] label"
     * form of the property. Empty until extract() has been called.
     */
    protected array $excluded = [];

    /**
     * Claims of properties that are neither included nor excluded, keyed by
     * the "[code] label" form of the property. Empty until extract() has been
     * called.
     */
    protected array $unused = [];

    /**
     * The entity data passed to extract(), minus its "claims" key.
     */
    protected array $everythingElse = [];

    /**
     * ID of the entity passed to the last extract() call.
     */
    protected string $entityId;

    /**
     * Property ID => label map used by replaceValue().
     *
     * @var array
     */
    protected $properties = [];

    /**
     * Labels returned by WikidataClient::getLabelsForEntities(); replaceValue()
     * reads it as an entity ID => label map.
     *
     * @var array
     */
    protected $entities = [];

    /**
     * @param array $exclusions Property IDs to put in the "excluded" bucket
     * @param array $inclusions Category => list of property IDs to put in the
     *                          "included" bucket
     */
    public function __construct(protected array $exclusions, protected array $inclusions)
    {
    }

    /**
     * @return array Included claims grouped by category
     */
    public function included()
    {
        return $this->included;
    }

    /**
     * @return array Claims of explicitly excluded properties
     */
    public function excluded()
    {
        return $this->excluded;
    }

    /**
     * @return array Claims of properties neither included nor excluded
     */
    public function unused()
    {
        return $this->unused;
    }

    /**
     * @return array Entity data without its claims
     */
    public function everythingElse()
    {
        return $this->everythingElse;
    }

    /**
     * Split data from specified array in three arrays containing explicitly
     * included properties, explicitly excluded properties and unused
     * properties (neither included nor excluded). Whatever is not a claim is
     * kept aside and exposed through everythingElse().
     *
     * @param array  $entityData Decoded entity data; claims are read from
     *                           its "claims" key
     * @param string $entityId   ID of the entity being extracted
     */
    public function extract(array $entityData, string $entityId)
    {
        $this->entityId = $entityId;

        // The JSON form is only used to scan for P…/Q… identifiers;
        // json_encode() returns false on failure, in which case nothing is
        // scanned.
        $json = json_encode($entityData, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);
        $json = $json === false ? '' : $json;

        $this->properties = $this->getDeclaredPropertiesInEntity($json);
        $this->entities = $this->getDeclaredEntitiesInEntity($json, $entityId);

        // An entity without a usable "claims" key is treated as having no claims.
        $claims = $entityData['claims'] ?? [];
        $result = $this->browse(is_array($claims) ? $claims : []);

        $this->included = $result['included'];
        $this->excluded = $result['excluded'];
        $this->unused = $result['unused'];

        unset($entityData['claims']);
        $this->everythingElse = $entityData;
    }

    /**
     * Return an array containing Wikidata Property ID as keys and corresponding
     * label as values. Only properties that have a label stored in database
     * are part of the result.
     */
    private function getDeclaredPropertiesInEntity(string $data): array
    {
        $ids = $this->matchIds('/P\d{1,}/', $data);
        if ($ids === []) {
            return [];
        }

        // Key the labels by property ID: pairing IDs and labels by position
        // would misalign them as soon as one ID has no row in database or the
        // rows come back in a different order.
        $labels = WikidataProperty::whereIn('property_id', $ids)
            ->get()
            ->pluck('label', 'property_id');

        $result = [];
        foreach ($ids as $id) {
            $label = $labels->get($id);
            if ($label !== null) {
                $result[$id] = $label;
            }
        }

        return $result;
    }

    /**
     * Return the labels of the Wikidata entities referenced in the specified
     * data, as provided by WikidataClient::getLabelsForEntities(). The
     * extracted entity itself is left out of the lookup.
     */
    private function getDeclaredEntitiesInEntity(string $data, string $entityId)
    {
        // Filter by value (and reindex) so the entity's own ID is really
        // removed from the list.
        $ids = array_values(array_filter(
            $this->matchIds('/Q\d{1,}/', $data),
            static fn ($id) => $id !== $entityId,
        ));

        return app()->make(WikidataClient::class)->getLabelsForEntities($ids);
    }

    /**
     * Return the unique matches of a pattern in the specified data, as a
     * naturally sorted list.
     */
    private function matchIds(string $pattern, string $data): array
    {
        preg_match_all($pattern, $data, $matches);
        $ids = array_unique($matches[0] ?? []);
        natsort($ids);

        return array_values($ids);
    }

    /**
     * Return every property ID declared in the "inclusions" configuration,
     * whatever its category, as a flat list.
     */
    private function includedPropertyIds(): array
    {
        $ids = [];
        $inclusions = $this->inclusions;
        array_walk_recursive($inclusions, static function ($id) use (&$ids): void {
            $ids[] = $id;
        });

        return $ids;
    }

    /**
     * Browse the claims and dispatch each property in the "excluded",
     * "included" or "unused" bucket. Exclusion wins over inclusion.
     */
    private function browse(array $claims): array
    {
        // Loop-invariant: flatten the configuration once, not once per property.
        $includedIds = $this->includedPropertyIds();

        $included = [];
        $excluded = [];
        $unused = [];

        foreach ($claims as $key => $data) {
            $key = (string) $key;
            $isExcluded = in_array($key, $this->exclusions, true);
            $isIncluded = in_array($key, $includedIds, true);

            $claim = $this->parseClaims($data, $isIncluded);

            if ($isExcluded) {
                $excluded[$this->replaceValue($key, true, true)] = $claim;
            } elseif ($isIncluded) {
                // Kept under the raw property ID: reorganizeIncluded() looks
                // the properties up by ID and labels them itself.
                $included[$key] = $claim;
            } else {
                $unused[$this->replaceValue($key, true, true)] = $claim;
            }
        }

        return [
            'excluded' => $excluded,
            'included' => $this->reorganizeIncluded($included),
            'unused' => $unused,
        ];
    }

    /**
     * Parse claims of a specific property
     */
    private function parseClaims(array $data, bool $parentIncluded): array
    {
        $result = [];
        foreach ($data as $claim) {
            $result[] = $this->parseClaim($claim, $parentIncluded);
        }

        return $result;
    }

    /**
     * Parse a specific claim. Returns the main value alone, or a
     * single-entry array "main value => qualifiers" when the claim has
     * qualifiers.
     */
    private function parseClaim(array $data, bool $parentIncluded): string|array
    {
        $value = $this->parseSnak($data['mainsnak'] ?? [], $parentIncluded);

        if (empty($data['qualifiers'])) {
            return $value;
        }

        $itemQualifiers = [];
        foreach ($data['qualifiers'] as $qualifierProperty => $qualifiers) {
            $qualifierKey = $this->replaceValue((string) $qualifierProperty, true, !$parentIncluded);
            foreach ($qualifiers as $qualifierData) {
                $itemQualifiers[$qualifierKey][] = $this->parseSnak($qualifierData, $parentIncluded);
            }
        }

        return [
            $value => $itemQualifiers,
        ];
    }

    /**
     * Parse a specific snak and return its value as a string.
     *
     * A snak without a datavalue ("novalue" / "somevalue" snaks) yields its
     * "snaktype", or an empty string if there is none. A datavalue of a type
     * with no dedicated handling is returned as a string (JSON for
     * non-scalar values) instead of aborting the extraction.
     */
    private function parseSnak(array $data, bool $parentIncluded): string
    {
        // isset() rather than empty(): values such as "0" are legitimate.
        if (!isset($data['datavalue']['value'], $data['datavalue']['type'])) {
            return (string) ($data['snaktype'] ?? '');
        }

        $value = $data['datavalue']['value'];

        return match ($data['datavalue']['type']) {
            'wikibase-entityid' => $this->replaceValue((string) $value['id'], true, !$parentIncluded),
            'string' => $this->replaceValue((string) $value, true, !$parentIncluded),
            'time' => (string) $value['time'],
            'quantity' => (string) $value['amount'],
            'monolingualtext' => (string) $value['text'],
            default => is_scalar($value)
                ? (string) $value
                : (string) json_encode($value, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE),
        };
    }

    /**
     * Replace a value with a more human-friendly version. Basically replaces
     * Wikidata entities and properties with the labels collected by
     * extract(), if it applies.
     *
     * @param string $value     Raw value (property ID, entity ID or anything else)
     * @param bool   $showCode  Whether the raw value is part of the result
     * @param bool   $showLabel Whether the label is part of the result
     *
     * @return string "[code] label" when both are requested and a label is
     *                known, the raw value when only the code is requested,
     *                the label (or the raw value if there is none) otherwise
     */
    private function replaceValue(string $value, bool $showCode = true, bool $showLabel = true): string
    {
        $code = $value;
        // Properties take precedence over entities; unknown values are their
        // own label.
        $label = (string) ($this->properties[$value] ?? $this->entities[$value] ?? $value);

        if ($showCode && $showLabel) {
            return $code !== $label ? sprintf('[%s] %s', $code, $label) : $value;
        }

        return $showCode ? $code : $label;
    }

    /**
     * Take the "raw" included data and reorganize it according to the
     * "inclusions" Wikidata configuration. Categories without any data are
     * left out.
     */
    private function reorganizeIncluded(array $includedData): array
    {
        $reorganized = [];
        foreach ($this->inclusions as $category => $properties) {
            $result = $this->includeProperties($includedData, $properties);
            if (!empty($result)) {
                $reorganized[$category] = $result;
            }
        }

        return $reorganized;
    }

    /**
     * Include specific properties: pick the listed property IDs out of the
     * included data, in the listed order, keyed by property label.
     */
    private function includeProperties(array $includedData, $properties): array
    {
        $result = [];
        foreach ($properties as $propertyId) {
            if (!array_key_exists($propertyId, $includedData)) {
                continue;
            }
            $newKey = $this->replaceValue((string) $propertyId, false, true);
            $result[$newKey] = $this->includeValues($includedData[$propertyId]);
        }

        return $result;
    }

    /**
     * Include specific values: recursively replace both keys and scalar
     * values with their label.
     */
    private function includeValues(array $values): array
    {
        $newValues = [];
        foreach ($values as $key => $value) {
            // List indexes are integers: cast explicitly instead of relying
            // on implicit coercion.
            $newKey = $this->replaceValue((string) $key, false, true);
            $newValues[$newKey] = is_array($value)
                ? $this->includeValues($value)
                : $this->replaceValue((string) $value, false, true);
        }

        return $newValues;
    }
}