From fe47e490ea0735690a3b509ac4f536e6b32456bd Mon Sep 17 00:00:00 2001 From: Richard Dern Date: Fri, 26 Apr 2024 00:35:17 +0200 Subject: [PATCH] Improved wikidata extractor --- app/Providers/WikidataServiceProvider.php | 3 +- app/Services/Wikidata/WikidataExtractor.php | 101 ++++++++++++++++---- 2 files changed, 86 insertions(+), 18 deletions(-) diff --git a/app/Providers/WikidataServiceProvider.php b/app/Providers/WikidataServiceProvider.php index 74cd3cb..f92819a 100644 --- a/app/Providers/WikidataServiceProvider.php +++ b/app/Providers/WikidataServiceProvider.php @@ -22,8 +22,9 @@ public function register(): void $this->app->singleton(WikidataExtractor::class, function ($app) { $inclusions = $app['config']->get('wikidata.inclusions', []); $exclusions = $app['config']->get('wikidata.exclusions', []); + $templates = $app['config']->get('wikidata.templates', []); - return new WikidataExtractor($exclusions, $inclusions); + return new WikidataExtractor($exclusions, $inclusions, $templates); }); } diff --git a/app/Services/Wikidata/WikidataExtractor.php b/app/Services/Wikidata/WikidataExtractor.php index 4232684..a39f26b 100644 --- a/app/Services/Wikidata/WikidataExtractor.php +++ b/app/Services/Wikidata/WikidataExtractor.php @@ -21,7 +21,9 @@ class WikidataExtractor protected $entities; - public function __construct(protected array $exclusions, protected array $inclusions) + protected array $wikiLinks; + + public function __construct(protected array $exclusions, protected array $inclusions, protected array $templates) { } @@ -45,6 +47,11 @@ public function everythingElse() return $this->everythingElse; } + public function wikiLinks() + { + return $this->wikiLinks; + } + /** * Split data from specified array in three arrays containing explicitely * included properties, explicitely excluded properties and unused @@ -59,9 +66,10 @@ public function extract(array $entityData, string $entityId) $result = $this->browse($entityData['claims']); - $this->included = $result['included']; - $this->excluded = $result['excluded']; - $this->unused = $result['unused']; + $this->included = $result['included']; + $this->excluded = $result['excluded']; + $this->unused = $result['unused']; + $this->wikiLinks = $this->extractWikiLinks($entityData['sitelinks']); $this->everythingElse = $entityData; } @@ -109,18 +117,16 @@ private function browse(array $claims) $isExcluded = in_array($key, $this->exclusions); $isIncluded = in_array($key, collect($this->inclusions)->flatten()->values()->toArray()); $isUnused = !$isExcluded && !$isIncluded; - - $claim = $this->parseClaims($data, $isIncluded); + $claimGroup = $this->parseClaims($key, $data, $isIncluded); if ($isExcluded) { $newKey = $this->replaceValue($key, true, true); - $excluded[$newKey] = $claim; + $excluded[$newKey] = $claimGroup; } elseif ($isIncluded) { - // $newKey = $this->replaceValue($key, true); - $included[$key] = $claim; + $included[$key] = $claimGroup; } elseif ($isUnused) { $newKey = $this->replaceValue($key, true, true); - $unused[$newKey] = $claim; + $unused[$newKey] = $claimGroup; } } @@ -134,12 +140,18 @@ private function browse(array $claims) /** * Parse claims of a specific property */ - private function parseClaims(array $data, bool $parentIncluded) + private function parseClaims(string $propertyId, array $data, bool $parentIncluded) { $result = []; foreach ($data as $claim) { - $result[] = $this->parseClaim($claim, $parentIncluded); + $parsedClaim = $this->parseClaim($propertyId, $claim, $parentIncluded); + + if (is_array($parsedClaim)) { + $result = array_merge_recursive($result, $parsedClaim); + } else { + $result[] = $parsedClaim; + } } return $result; @@ -148,7 +160,7 @@ private function parseClaims(array $data, bool $parentIncluded) /** * Parse a specific claim */ - private function parseClaim(array $data, bool $parentIncluded) + private function parseClaim(string $propertyId, array $data, bool $parentIncluded) { $value = $this->parseSnak($data['mainsnak'], $parentIncluded); @@ -169,7 +181,11 @@ private function parseClaim(array $data, bool $parentIncluded) $value => $itemQualifiers, ]; } else { - $result = $value; + if (in_array($propertyId, $this->inclusions['Distribution'])) { + $result = [$value => []]; + } else { + $result = $value; + } } return $result; @@ -218,12 +234,14 @@ private function parseSnak(array $data, bool $parentIncluded) * Wikidata entities and properties with labels stored in database, if it * applies. */ - private function replaceValue(string $value, bool $showCode = true, bool $showLabel = true) + private function replaceValue(string $value, bool $showCode = true, bool $showLabel = true, ?string $key = null) { $code = $value; $label = $value; - if (array_key_exists($value, $this->properties)) { + if (array_key_exists($value, $this->templates)) { + $label = $this->templates[$value]['title']; + } elseif (array_key_exists($value, $this->properties)) { $label = Str::ucfirst($this->properties[$value]); } elseif (array_key_exists($value, $this->entities)) { $label = $this->entities[$value]; @@ -231,6 +249,14 @@ private function replaceValue(string $value, bool $showCode = true, bool $showLa $both = $code !== $label ? sprintf('[%s] %s', $code, $label) : $value; + if (!empty($key)) { + $template = $this->templates[$key]['template'] ?? null; + + if (!empty($template)) { + return sprintf($template, $value); + } + } + if ($showCode && $showLabel) { return $both; } elseif ($showCode) { @@ -272,10 +298,23 @@ private function includeProperties($includedData, $properties) } $newKey = $this->replaceValue($propertyId, false, true); - $values = $includedData[$propertyId]; $result[$newKey] = $this->includeValues($propertyId, $values); + + foreach ($result[$newKey] as $key => $newValues) { + $newSubKey = $key; + + if (is_string($newValues)) { + $newValues = $this->replaceValue($newValues, false, true, $propertyId); + } else { + $newSubKey = $this->replaceValue($key, false, true, $propertyId); + } + + unset($result[$newKey][$key]); + + $result[$newKey][$newSubKey] = $newValues; + } } return $result; @@ -306,4 +345,32 @@ private function includeValues(string $propertyId, array $values) return $newValues; } + + private function extractWikiLinks(array $siteLinks) + { + $links = ['Liens' => []]; + + foreach ($siteLinks as $name => $data) { + if (!Str::startsWith($name, ['fr', 'en'])) { + continue; + } + + $siteType = Str::remove(['fr', 'en'], $name); + $host = $this->getWikidataHost($siteType); + $url = sprintf('https://%s.%s/wiki/%s', substr($name, 0, 2), $host, urlencode($data['title'])); + + $links['Liens'][$host][] = urldecode($url); + } + + return $links; + } + + private function getWikidataHost($siteType) + { + if ($siteType === 'wiki') { + return 'wikipedia.org'; + } + + return $siteType . '.org'; + } }