Improved wikidata extractor
This commit is contained in:
parent
41ce35b3e9
commit
fe47e490ea
|
@ -22,8 +22,9 @@ public function register(): void
|
|||
$this->app->singleton(WikidataExtractor::class, function ($app) {
|
||||
$inclusions = $app['config']->get('wikidata.inclusions', []);
|
||||
$exclusions = $app['config']->get('wikidata.exclusions', []);
|
||||
$templates = $app['config']->get('wikidata.templates', []);
|
||||
|
||||
return new WikidataExtractor($exclusions, $inclusions);
|
||||
return new WikidataExtractor($exclusions, $inclusions, $templates);
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
@ -21,7 +21,9 @@ class WikidataExtractor
|
|||
|
||||
protected $entities;
|
||||
|
||||
public function __construct(protected array $exclusions, protected array $inclusions)
|
||||
protected array $wikiLinks;
|
||||
|
||||
public function __construct(protected array $exclusions, protected array $inclusions, protected array $templates)
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -45,6 +47,11 @@ public function everythingElse()
|
|||
return $this->everythingElse;
|
||||
}
|
||||
|
||||
/**
 * Get the wiki links computed by the last call to extract().
 *
 * The backing property is typed and has no default value, so reading it
 * before extract() has run would raise an "accessed before initialization"
 * error; an empty array is returned instead in that case.
 *
 * @return array Links as built by extractWikiLinks(), or an empty array
 *               when nothing has been extracted yet.
 */
public function wikiLinks(): array
{
    // `??` is safe on an uninitialized typed property and yields the fallback.
    return $this->wikiLinks ?? [];
}
|
||||
|
||||
/**
|
||||
* Split data from specified array into three arrays containing explicitly
|
||||
* included properties, explicitly excluded properties and unused
|
||||
|
@ -59,9 +66,10 @@ public function extract(array $entityData, string $entityId)
|
|||
|
||||
$result = $this->browse($entityData['claims']);
|
||||
|
||||
$this->included = $result['included'];
|
||||
$this->excluded = $result['excluded'];
|
||||
$this->unused = $result['unused'];
|
||||
$this->included = $result['included'];
|
||||
$this->excluded = $result['excluded'];
|
||||
$this->unused = $result['unused'];
|
||||
$this->wikiLinks = $this->extractWikiLinks($entityData['sitelinks']);
|
||||
|
||||
$this->everythingElse = $entityData;
|
||||
}
|
||||
|
@ -109,18 +117,16 @@ private function browse(array $claims)
|
|||
$isExcluded = in_array($key, $this->exclusions);
|
||||
$isIncluded = in_array($key, collect($this->inclusions)->flatten()->values()->toArray());
|
||||
$isUnused = !$isExcluded && !$isIncluded;
|
||||
|
||||
$claim = $this->parseClaims($data, $isIncluded);
|
||||
$claimGroup = $this->parseClaims($key, $data, $isIncluded);
|
||||
|
||||
if ($isExcluded) {
|
||||
$newKey = $this->replaceValue($key, true, true);
|
||||
$excluded[$newKey] = $claim;
|
||||
$excluded[$newKey] = $claimGroup;
|
||||
} elseif ($isIncluded) {
|
||||
// $newKey = $this->replaceValue($key, true);
|
||||
$included[$key] = $claim;
|
||||
$included[$key] = $claimGroup;
|
||||
} elseif ($isUnused) {
|
||||
$newKey = $this->replaceValue($key, true, true);
|
||||
$unused[$newKey] = $claim;
|
||||
$unused[$newKey] = $claimGroup;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -134,12 +140,18 @@ private function browse(array $claims)
|
|||
/**
|
||||
* Parse claims of a specific property
|
||||
*/
|
||||
private function parseClaims(array $data, bool $parentIncluded)
|
||||
private function parseClaims(string $propertyId, array $data, bool $parentIncluded)
|
||||
{
|
||||
$result = [];
|
||||
|
||||
foreach ($data as $claim) {
|
||||
$result[] = $this->parseClaim($claim, $parentIncluded);
|
||||
$parsedClaim = $this->parseClaim($propertyId, $claim, $parentIncluded);
|
||||
|
||||
if (is_array($parsedClaim)) {
|
||||
$result = array_merge_recursive($result, $parsedClaim);
|
||||
} else {
|
||||
$result[] = $parsedClaim;
|
||||
}
|
||||
}
|
||||
|
||||
return $result;
|
||||
|
@ -148,7 +160,7 @@ private function parseClaims(array $data, bool $parentIncluded)
|
|||
/**
|
||||
* Parse a specific claim
|
||||
*/
|
||||
private function parseClaim(array $data, bool $parentIncluded)
|
||||
private function parseClaim(string $propertyId, array $data, bool $parentIncluded)
|
||||
{
|
||||
$value = $this->parseSnak($data['mainsnak'], $parentIncluded);
|
||||
|
||||
|
@ -169,7 +181,11 @@ private function parseClaim(array $data, bool $parentIncluded)
|
|||
$value => $itemQualifiers,
|
||||
];
|
||||
} else {
|
||||
$result = $value;
|
||||
if (in_array($propertyId, $this->inclusions['Distribution'])) {
|
||||
$result = [$value => []];
|
||||
} else {
|
||||
$result = $value;
|
||||
}
|
||||
}
|
||||
|
||||
return $result;
|
||||
|
@ -218,12 +234,14 @@ private function parseSnak(array $data, bool $parentIncluded)
|
|||
* Wikidata entities and properties with labels stored in database, if it
|
||||
* applies.
|
||||
*/
|
||||
private function replaceValue(string $value, bool $showCode = true, bool $showLabel = true)
|
||||
private function replaceValue(string $value, bool $showCode = true, bool $showLabel = true, ?string $key = null)
|
||||
{
|
||||
$code = $value;
|
||||
$label = $value;
|
||||
|
||||
if (array_key_exists($value, $this->properties)) {
|
||||
if (array_key_exists($value, $this->templates)) {
|
||||
$label = $this->templates[$value]['title'];
|
||||
} elseif (array_key_exists($value, $this->properties)) {
|
||||
$label = Str::ucfirst($this->properties[$value]);
|
||||
} elseif (array_key_exists($value, $this->entities)) {
|
||||
$label = $this->entities[$value];
|
||||
|
@ -231,6 +249,14 @@ private function replaceValue(string $value, bool $showCode = true, bool $showLa
|
|||
|
||||
$both = $code !== $label ? sprintf('[%s] %s', $code, $label) : $value;
|
||||
|
||||
if (!empty($key)) {
|
||||
$template = $this->templates[$key]['template'] ?? null;
|
||||
|
||||
if (!empty($template)) {
|
||||
return sprintf($template, $value);
|
||||
}
|
||||
}
|
||||
|
||||
if ($showCode && $showLabel) {
|
||||
return $both;
|
||||
} elseif ($showCode) {
|
||||
|
@ -272,10 +298,23 @@ private function includeProperties($includedData, $properties)
|
|||
}
|
||||
|
||||
$newKey = $this->replaceValue($propertyId, false, true);
|
||||
|
||||
$values = $includedData[$propertyId];
|
||||
|
||||
$result[$newKey] = $this->includeValues($propertyId, $values);
|
||||
|
||||
foreach ($result[$newKey] as $key => $newValues) {
|
||||
$newSubKey = $key;
|
||||
|
||||
if (is_string($newValues)) {
|
||||
$newValues = $this->replaceValue($newValues, false, true, $propertyId);
|
||||
} else {
|
||||
$newSubKey = $this->replaceValue($key, false, true, $propertyId);
|
||||
}
|
||||
|
||||
unset($result[$newKey][$key]);
|
||||
|
||||
$result[$newKey][$newSubKey] = $newValues;
|
||||
}
|
||||
}
|
||||
|
||||
return $result;
|
||||
|
@ -306,4 +345,32 @@ private function includeValues(string $propertyId, array $values)
|
|||
|
||||
return $newValues;
|
||||
}
|
||||
|
||||
/**
 * Build the French and English wiki page links from an entity's sitelinks.
 *
 * Only site names made of an `fr` or `en` language prefix immediately
 * followed by a wiki project name (`frwiki`, `enwikiquote`,
 * `frwiktionary`, ...) are kept. Entries without a usable title are skipped.
 *
 * @param array $siteLinks Sitelinks keyed by site name; each entry is
 *                         expected to carry a 'title' string.
 *
 * @return array Links grouped by host under the 'Liens' key:
 *               ['Liens' => [host => [url, ...]]].
 */
private function extractWikiLinks(array $siteLinks): array
{
    $links = ['Liens' => []];

    foreach ($siteLinks as $name => $data) {
        // Anchor the whole site name: a plain prefix test would also accept
        // other language codes that merely start with "fr" or "en"
        // (e.g. "frrwiki", "frpwiki") and produce a bogus host.
        if (preg_match('/^(fr|en)(wik[a-z]+)$/', (string) $name, $matches) !== 1) {
            continue;
        }

        $title = $data['title'] ?? null;

        if (!is_string($title) || $title === '') {
            continue;
        }

        [, $language, $siteType] = $matches;
        $host = $this->getWikidataHost($siteType);

        // Keep the title human readable (accents are left untouched) but make
        // the link valid: spaces become underscores, and the characters that
        // would otherwise start a query string, a fragment or an escape
        // sequence are percent-encoded.
        $path = strtr($title, [
            ' ' => '_',
            '%' => '%25',
            '?' => '%3F',
            '#' => '%23',
        ]);

        $links['Liens'][$host][] = sprintf('https://%s.%s/wiki/%s', $language, $host, $path);
    }

    return $links;
}
|
||||
|
||||
/**
 * Resolve the domain name used for the given type of wiki site.
 *
 * Only the bare 'wiki' type is special-cased (it maps to 'wikipedia.org');
 * any other type is used as the domain name as is, suffixed with '.org'.
 *
 * @param mixed $siteType Site type, e.g. 'wiki'.
 *
 * @return string Domain name without the language subdomain.
 */
private function getWikidataHost($siteType)
{
    return $siteType === 'wiki'
        ? 'wikipedia.org'
        : $siteType . '.org';
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user