Analyzer component for Cyca
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

319 lines
8.5 KiB

<?php
declare(strict_types=1);
namespace Cyca\Analyzer;
use Cyca\Fetcher\Facades\Fetcher;
use Cyca\Helpers\Helpers\GenericObject;
use Exception;
use Illuminate\Contracts\Filesystem\Filesystem;
use Illuminate\Support\Str;
use League\Flysystem\FileNotFoundException;
/**
* Analyze a document
*/
class Analyzer extends GenericObject
{
    /**
     * Source kind: the document is designated by a URL.
     */
    public const SOURCE_URL = 'url';

    /**
     * Source kind: the document is a file on the configured filesystem.
     */
    public const SOURCE_FILE = 'file';

    /**
     * Source kind: the document is supplied as a raw string body.
     */
    public const SOURCE_BODY = 'body';

    # --------------------------------------------------------------------------
    # ----| Properties |--------------------------------------------------------
    # --------------------------------------------------------------------------

    /**
     * The attributes that should be visible in serialization.
     *
     * @var array
     */
    protected $visible = [
        'source',
        'sourceUrl',
        'sourcePath',
        'mimeType',
        'fileSize',
        'lastModified',
        'fetcher',
        'analyzers',
        'fastMode',
    ];

    /**
     * The attributes that are mass assignable.
     *
     * @var string[]
     */
    protected $fillable = [
        'source',
        'sourceUrl',
        'sourcePath',
        'mimeType',
        'fileSize',
        'lastModified',
        'fetcher',
        'analyzers',
        'fastMode',
    ];

    /**
     * The attributes that should be cast.
     *
     * @var array
     */
    protected $casts = [
        'lastModified' => 'datetime',
        'fastMode'     => 'boolean',
    ];

    /**
     * Filesystem used to read (and, for bodies, write) the analyzed document.
     *
     * @var Filesystem|null
     */
    protected ?Filesystem $fileSystem = null;

    # ----| Accessors |---------------------------------------------------------

    /**
     * Return the filesystem currently in use, or null when none was defined.
     *
     * @return Filesystem|null
     */
    public function getFilesystem(): ?Filesystem
    {
        return $this->fileSystem;
    }

    # ----| Mutators |----------------------------------------------------------

    /**
     * Define what we are going to analyze: a URL, a file or a file's body.
     * Once a non-empty source has been stored, later assignments are ignored.
     *
     * @param string|null $source One of the SOURCE_* constants
     * @return void
     */
    protected function setSourceAttribute(?string $source)
    {
        // First writer wins: keep the source that was recorded initially.
        if (!empty($this->attributes['source'])) {
            return;
        }

        $this->attributes['source'] = $source;
    }

    /**
     * Set the filesystem used to access analyzed documents.
     *
     * @param Filesystem $store
     * @return void
     */
    public function setFilesystem(Filesystem $store)
    {
        $this->fileSystem = $store;
    }

    # --------------------------------------------------------------------------
    # ----| Methods |-----------------------------------------------------------
    # --------------------------------------------------------------------------

    /**
     * Analyze a URL. Uses Cyca's Fetcher to fetch the file at the target URL
     * first, adopts the fetcher's filesystem, then analyzes the fetched file.
     *
     * @param string $url
     * @param bool $fastMode Enable fast mode. In this mode, some data won't be fetched.
     * @return self
     * @throws Exception when analyzing the fetched file fails (see analyzeFile)
     */
    public function analyzeUrl(string $url, bool $fastMode = false): self
    {
        $this->reset();

        $this->fastMode  = $fastMode;
        $this->source    = self::SOURCE_URL;
        $this->sourceUrl = $url;

        $client = Fetcher::fetch($url);

        $this->setFilesystem($client->getFilesystem());
        $this->setAttribute('fetcher', $client->toArray());

        return $this->analyzeFile($client->filename, $fastMode);
    }

    /**
     * Analyze a document body given as a string. Requires a filesystem to
     * store this body on disk. The resulting file can be deleted after
     * analysis, which is the default behaviour.
     *
     * @param string $body
     * @param bool $fastMode Enable fast mode. In this mode, some data won't be fetched.
     * @param bool $deleteAfter Indicate if we should delete the file after analysis.
     * @return self
     * @throws Exception when no filesystem has been defined
     */
    public function analyzeBody(string $body, bool $fastMode = false, bool $deleteAfter = true): self
    {
        $this->reset();

        $this->fastMode = $fastMode;
        $this->source   = self::SOURCE_BODY;

        $filename = $this->storeBodyOnDisk($body);

        return $this->analyzeFile($filename, $fastMode, $deleteAfter);
    }

    /**
     * Analyze the file at the specified path. A filesystem must have been
     * defined. The file can be deleted after analysis; by default it is
     * preserved. When deletion is requested it happens even if the analysis
     * throws, so temporary files are not left behind.
     *
     * @param string $file
     * @param bool $fastMode Enable fast mode. In this mode, some data won't be fetched.
     * @param bool $deleteAfter Delete file after analysis
     * @return self
     * @throws Exception when no filesystem has been defined
     * @throws FileNotFoundException when the file does not exist on the filesystem
     */
    public function analyzeFile(string $file, bool $fastMode = false, bool $deleteAfter = false): self
    {
        $this->fastMode = $fastMode;
        // NOTE(review): presumably routed through setSourceAttribute() by
        // GenericObject, in which case a source already recorded by
        // analyzeUrl()/analyzeBody() is kept - confirm.
        $this->source     = self::SOURCE_FILE;
        $this->sourcePath = $file;

        if ($this->fileSystem === null) {
            throw new Exception("You need to define a filesystem before analyzing a file");
        }

        if (!$this->fileSystem->exists($file)) {
            throw new FileNotFoundException($file);
        }

        try {
            $this->runAnalyzis();
        } finally {
            // Clean up regardless of the outcome of the analysis.
            if ($deleteAfter) {
                $this->fileSystem->delete($file);
            }
        }

        return $this;
    }

    /**
     * Use the defined filesystem to store a document's body (as a string).
     * The path is built from the body's md5: the 32-character hash is cut
     * into two-character directory segments, and the full hash is used as the
     * file name. The returned path is usable from the same filesystem.
     *
     * @param string $body
     * @return string
     * @throws Exception when no filesystem has been defined
     */
    protected function storeBodyOnDisk(string $body): string
    {
        if ($this->fileSystem === null) {
            throw new Exception("You need to define a filesystem before storing document's body");
        }

        $hash      = md5($body);
        $directory = implode('/', str_split($hash, 2));
        $filename  = sprintf('%s/%s', $directory, $hash);

        $this->fileSystem->put($filename, $body);

        return $filename;
    }

    /**
     * Store basic metadata, find the analyzers matching the file's mime type
     * and run each of them. Every analyzer receives the attributes discovered
     * so far, including the results of the analyzers that ran before it. The
     * collected results are stored in the "analyzers" attribute, keyed by
     * analyzer class name.
     *
     * @return void
     */
    protected function runAnalyzis(): void
    {
        $this->storeBasicMetaData();

        $analyzers  = $this->getAnalyzers();
        $attributes = $this->toArray();

        // Make sure the key exists even when no analyzer is configured for
        // this mime type; otherwise reading it below hits an undefined key.
        $attributes['analyzers'] ??= [];

        foreach ($analyzers as $className) {
            $instance = new $className();

            $instance
                ->setFilesystem($this->fileSystem)
                ->setDiscoveredAttributes($attributes)
                ->analyze($this->fastMode);

            $attributes['analyzers'][$className] = $instance->toArray();
        }

        $this->setAttribute('analyzers', $attributes['analyzers']);
    }

    /**
     * Find the analyzers associated with the file's mime type. Analyzers from
     * the referenced groups come first (groups are resolved recursively),
     * followed by the analyzers listed directly for the mime type. A class
     * reachable through several paths is returned only once, at the position
     * of its first occurrence, so it is not run twice.
     *
     * @return array List of analyzer class names
     */
    protected function getAnalyzers(): array
    {
        // Configuration keys use "_" in place of the "/" of the mime type.
        // NOTE(review): mime types containing "." will be split by the
        // dot-notation of config() - confirm the configuration accounts for it.
        $mimeType  = Str::replace('/', '_', $this->mimeType);
        $groups    = config(sprintf('analyzer.analyzers.%s.groups', $mimeType), []);
        $analyzers = config(sprintf('analyzer.analyzers.%s.analyzers', $mimeType), []);

        $classes = [];

        foreach ($groups as $subGroup) {
            $classes = array_merge($classes, $this->getAnalyzersPerGroup($subGroup));
        }

        $classes = array_merge($classes, $analyzers);

        return array_values(array_unique($classes));
    }

    /**
     * Return an array of analyzer class names, taken from the configuration
     * file, associated with the specified group. Sub-groups are resolved
     * recursively and their analyzers are listed before the group's own.
     *
     * NOTE: there is no cycle detection; a group that references itself,
     * directly or through its sub-groups, would recurse without end.
     *
     * @param string $groupName
     * @return array
     */
    protected function getAnalyzersPerGroup(string $groupName): array
    {
        $groups    = config(sprintf('analyzer.groups.%s.groups', $groupName), []);
        $analyzers = config(sprintf('analyzer.groups.%s.analyzers', $groupName), []);

        $classes = [];

        foreach ($groups as $subGroup) {
            $classes = array_merge($classes, $this->getAnalyzersPerGroup($subGroup));
        }

        return array_merge($classes, $analyzers);
    }

    /**
     * Read the mime type, size and last modification time of the source file
     * from the filesystem and store them as attributes.
     *
     * @return self
     */
    protected function storeBasicMetaData(): self
    {
        $path = $this->sourcePath;

        $this->mimeType     = $this->fileSystem->mimeType($path);
        $this->fileSize     = $this->fileSystem->size($path);
        $this->lastModified = $this->fileSystem->lastModified($path);

        return $this;
    }
}