1
0

Improved wikidata extractor

This commit is contained in:
Richard Dern 2024-04-26 00:35:17 +02:00
parent 41ce35b3e9
commit fe47e490ea
2 changed files with 86 additions and 18 deletions

View File

@ -22,8 +22,9 @@ public function register(): void
$this->app->singleton(WikidataExtractor::class, function ($app) {
$inclusions = $app['config']->get('wikidata.inclusions', []);
$exclusions = $app['config']->get('wikidata.exclusions', []);
$templates = $app['config']->get('wikidata.templates', []);
return new WikidataExtractor($exclusions, $inclusions);
return new WikidataExtractor($exclusions, $inclusions, $templates);
});
}

View File

@ -21,7 +21,9 @@ class WikidataExtractor
protected $entities;
public function __construct(protected array $exclusions, protected array $inclusions)
protected array $wikiLinks;
public function __construct(protected array $exclusions, protected array $inclusions, protected array $templates)
{
}
@ -45,6 +47,11 @@ public function everythingElse()
return $this->everythingElse;
}
/**
 * Return the wiki links computed by the last call to extract().
 *
 * The backing property is a typed property without a default value, so a
 * plain read before extract() has run would throw an "accessed before
 * initialization" Error. The null-coalescing read below is safe on an
 * uninitialized typed property and yields an empty array in that case.
 *
 * @return array Links as stored by extract(), or an empty array when
 *               nothing has been extracted yet.
 */
public function wikiLinks()
{
    return $this->wikiLinks ?? [];
}
/**
* Split data from specified array into three arrays containing explicitly
* included properties, explicitly excluded properties and unused
@ -59,9 +66,10 @@ public function extract(array $entityData, string $entityId)
$result = $this->browse($entityData['claims']);
$this->included = $result['included'];
$this->excluded = $result['excluded'];
$this->unused = $result['unused'];
$this->included = $result['included'];
$this->excluded = $result['excluded'];
$this->unused = $result['unused'];
$this->wikiLinks = $this->extractWikiLinks($entityData['sitelinks']);
$this->everythingElse = $entityData;
}
@ -109,18 +117,16 @@ private function browse(array $claims)
$isExcluded = in_array($key, $this->exclusions);
$isIncluded = in_array($key, collect($this->inclusions)->flatten()->values()->toArray());
$isUnused = !$isExcluded && !$isIncluded;
$claim = $this->parseClaims($data, $isIncluded);
$claimGroup = $this->parseClaims($key, $data, $isIncluded);
if ($isExcluded) {
$newKey = $this->replaceValue($key, true, true);
$excluded[$newKey] = $claim;
$excluded[$newKey] = $claimGroup;
} elseif ($isIncluded) {
// $newKey = $this->replaceValue($key, true);
$included[$key] = $claim;
$included[$key] = $claimGroup;
} elseif ($isUnused) {
$newKey = $this->replaceValue($key, true, true);
$unused[$newKey] = $claim;
$unused[$newKey] = $claimGroup;
}
}
@ -134,12 +140,18 @@ private function browse(array $claims)
/**
* Parse claims of a specific property
*/
private function parseClaims(array $data, bool $parentIncluded)
private function parseClaims(string $propertyId, array $data, bool $parentIncluded)
{
$result = [];
foreach ($data as $claim) {
$result[] = $this->parseClaim($claim, $parentIncluded);
$parsedClaim = $this->parseClaim($propertyId, $claim, $parentIncluded);
if (is_array($parsedClaim)) {
$result = array_merge_recursive($result, $parsedClaim);
} else {
$result[] = $parsedClaim;
}
}
return $result;
@ -148,7 +160,7 @@ private function parseClaims(array $data, bool $parentIncluded)
/**
* Parse a specific claim
*/
private function parseClaim(array $data, bool $parentIncluded)
private function parseClaim(string $propertyId, array $data, bool $parentIncluded)
{
$value = $this->parseSnak($data['mainsnak'], $parentIncluded);
@ -169,7 +181,11 @@ private function parseClaim(array $data, bool $parentIncluded)
$value => $itemQualifiers,
];
} else {
$result = $value;
if (in_array($propertyId, $this->inclusions['Distribution'])) {
$result = [$value => []];
} else {
$result = $value;
}
}
return $result;
@ -218,12 +234,14 @@ private function parseSnak(array $data, bool $parentIncluded)
* Wikidata entities and properties with labels stored in database, if it
* applies.
*/
private function replaceValue(string $value, bool $showCode = true, bool $showLabel = true)
private function replaceValue(string $value, bool $showCode = true, bool $showLabel = true, ?string $key = null)
{
$code = $value;
$label = $value;
if (array_key_exists($value, $this->properties)) {
if (array_key_exists($value, $this->templates)) {
$label = $this->templates[$value]['title'];
} elseif (array_key_exists($value, $this->properties)) {
$label = Str::ucfirst($this->properties[$value]);
} elseif (array_key_exists($value, $this->entities)) {
$label = $this->entities[$value];
@ -231,6 +249,14 @@ private function replaceValue(string $value, bool $showCode = true, bool $showLa
$both = $code !== $label ? sprintf('[%s] %s', $code, $label) : $value;
if (!empty($key)) {
$template = $this->templates[$key]['template'] ?? null;
if (!empty($template)) {
return sprintf($template, $value);
}
}
if ($showCode && $showLabel) {
return $both;
} elseif ($showCode) {
@ -272,10 +298,23 @@ private function includeProperties($includedData, $properties)
}
$newKey = $this->replaceValue($propertyId, false, true);
$values = $includedData[$propertyId];
$result[$newKey] = $this->includeValues($propertyId, $values);
foreach ($result[$newKey] as $key => $newValues) {
$newSubKey = $key;
if (is_string($newValues)) {
$newValues = $this->replaceValue($newValues, false, true, $propertyId);
} else {
$newSubKey = $this->replaceValue($key, false, true, $propertyId);
}
unset($result[$newKey][$key]);
$result[$newKey][$newSubKey] = $newValues;
}
}
return $result;
@ -306,4 +345,32 @@ private function includeValues(string $propertyId, array $values)
return $newValues;
}
/**
 * Build French and English wiki page URLs from a list of site links.
 *
 * A site key is kept only when it is exactly the language code "fr" or "en"
 * immediately followed by a project name starting with "wik" (for example
 * "frwiki", "enwikiquote" or "frwiktionary"). Keys of other language
 * editions that merely begin with the same two letters (for example
 * "frrwiki" or "enmwiki") are ignored.
 *
 * @param array $siteLinks Site links keyed by site id; the 'title' key of
 *                         each entry is used as the page title.
 *
 * @return array Array of the form ['Liens' => [host => [url, ...]]].
 */
private function extractWikiLinks(array $siteLinks)
{
    $links = ['Liens' => []];

    foreach ($siteLinks as $name => $data) {
        // Anchor both the language code and the project name: a bare prefix
        // test would accept "frrwiki" and derive the bogus host "rwiki.org".
        if (preg_match('/^(fr|en)(wik[a-z]+)$/', (string) $name, $matches) !== 1) {
            continue;
        }

        $title = $data['title'] ?? null;
        if (!is_string($title) || $title === '') {
            // No usable page title: there is nothing to link to.
            continue;
        }

        [, $language, $siteType] = $matches;
        $host = $this->getWikidataHost($siteType);

        // Page URLs use underscores in place of spaces; a raw space would
        // make the URL invalid. Other characters are left untouched so the
        // link stays human-readable.
        $links['Liens'][$host][] = sprintf(
            'https://%s.%s/wiki/%s',
            $language,
            $host,
            str_replace(' ', '_', $title)
        );
    }

    return $links;
}
/**
 * Map a site type to its host name.
 *
 * The site type "wiki" maps to "wikipedia.org"; any other value is used
 * as-is with ".org" appended.
 *
 * @param mixed $siteType Site type, e.g. "wiki" or "wikiquote".
 *
 * @return string Host name without language sub-domain.
 */
private function getWikidataHost($siteType)
{
    return match ($siteType) {
        'wiki' => 'wikipedia.org',
        default => $siteType . '.org',
    };
}
}