Browse Source

2-merge-analyzers (#3)

Closes #2.

Co-authored-by: Richard Dern <richard@athaliasoft.com>
Reviewed-on: #3
Co-authored-by: Richard Dern <richard@noreply.git.athaliasoft.com>
Co-committed-by: Richard Dern <richard@noreply.git.athaliasoft.com>
pull/10/head 1.1
Richard Dern 2 months ago
parent
commit
189e9d61a6
  1. 8
      composer.json
  2. 858
      composer.lock
  3. 29
      config/analyzer-feed-discoverer.php
  4. 14
      config/analyzer-html.php
  5. 4
      config/analyzer.php
  6. 167
      package/Analyzers/FeedDiscoverer.php
  7. 200
      package/Analyzers/HtmlAnalyzer.php
  8. 41
      package/Concerns/HasLinkTags.php
  9. 43
      package/Concerns/HasMetaTags.php
  10. 26
      package/Concerns/HasTitle.php
  11. 20
      package/Facades/FeedDiscoverer.php
  12. 20
      package/Facades/HtmlAnalyzer.php
  13. 110
      package/Models/LinkTag.php
  14. 47
      package/Models/MetaTag.php
  15. 4
      package/Providers/AnalyzerServiceProvider.php
  16. 37
      package/Providers/FeedDiscovererServiceProvider.php
  17. 37
      package/Providers/HtmlAnalyzerServiceProvider.php
  18. 6
      tests/TestCase.php

8
composer.json

@ -11,8 +11,8 @@
"require": {
"php": "^7.3|^8.0",
"illuminate/support": "^8.0",
"cyca/helpers": "^1.0",
"cyca/fetcher": "^1.0",
"cyca/helpers": "^1.1",
"cyca/fetcher": "^1.1",
"neitanod/forceutf8": "^2.0"
},
"require-dev": {
@ -37,7 +37,9 @@
"extra": {
"laravel": {
"providers": [
"Cyca\\Analyzer\\Providers\\AnalyzerServiceProvider"
"Cyca\\Analyzer\\Providers\\AnalyzerServiceProvider",
"Cyca\\Analyzer\\Providers\\FeedDiscovererServiceProvider",
"Cyca\\Analyzer\\Providers\\HtmlAnalyzerServiceProvider"
],
"aliases": {
"Analyzer": "Cyca\\Analyzer\\Facades\\Analyzer"

858
composer.lock

File diff suppressed because it is too large

29
config/analyzer-feed-discoverer.php

@ -0,0 +1,29 @@
<?php
return [
/**
* Candidate feed locations to probe in addition to the document's
* <link> tags. Some websites publish a feed without declaring it in
* their markup.
*
* Each entry is resolved against the analyzed document's URL by the feed
* discoverer. Unless fast mode is enabled, every resolved URL is then
* fetched and reported as reachable or non-reachable.
*
* Entries may be absolute (e.g. a feed served by a central provider) or
* relative. The list contains both a root-relative ("/rss") and a
* document-relative ("./rss") form of each path, so both locations are
* tried.
*/
'wellKnownUrls' => [
'/rss',
'/.rss',
'./rss',
'./.rss',
'/atom',
'/.atom',
'./atom',
'./.atom',
'/feed',
'/.feed',
'./feed',
'./.feed',
]
];

14
config/analyzer-html.php

@ -0,0 +1,14 @@
<?php
return [
/**
* MIME types identifying a potential feed.
*
* A <link> tag is only considered a feed candidate when the value of its
* "type" attribute is listed here (in addition to having an "href" and
* rel="alternate").
*/
'feedMimeTypes' => [
'application/xml',
'text/xml',
'application/rss+xml',
'application/atom+xml',
],
];

4
config/analyzer.php

@ -34,8 +34,8 @@ return [
'html' => [
'groups' => [],
'analyzers' => [
\Cyca\Analyzer\Html\HtmlAnalyzer::class,
\Cyca\Analyzer\FeedDiscoverer\FeedDiscoverer::class
\Cyca\Analyzer\Analyzers\HtmlAnalyzer::class,
\Cyca\Analyzer\Analyzers\FeedDiscoverer::class
]
]

167
package/Analyzers/FeedDiscoverer.php

@ -0,0 +1,167 @@
<?php
declare(strict_types=1);
namespace Cyca\Analyzer\Analyzers;
use Cyca\Analyzer\Contracts\Analyzer;
use Cyca\Helpers\Helpers\StringHelper;
use Cyca\Fetcher\Facades\Fetcher;
use Str;
/**
 * Discovers feeds.
 *
 * Candidate URLs are gathered from the <link> tags reported by other
 * analyzers and from the "well-known" URLs listed in the
 * "analyzer-feed-discoverer" configuration. Unless fast mode is enabled, each
 * candidate is then fetched to separate reachable URLs from unreachable ones.
 */
class FeedDiscoverer extends BaseAnalyzer implements Analyzer
{
    # --------------------------------------------------------------------------
    # ----| Properties |--------------------------------------------------------
    # --------------------------------------------------------------------------

    /**
     * The attributes that should be visible in serialization.
     *
     * @var array
     */
    protected $visible = [];

    /**
     * The attributes that are mass assignable.
     *
     * @var string[]
     */
    protected $fillable = [];

    /**
     * The attributes that should be cast.
     *
     * @var array
     */
    protected $casts = [];

    /**
     * When true, candidates are collected but never fetched.
     *
     * @var bool
     */
    protected bool $fastMode = false;

    # --------------------------------------------------------------------------
    # ----| Methods |-----------------------------------------------------------
    # --------------------------------------------------------------------------

    /**
     * Launch document's analysis.
     *
     * @param bool $fastMode Skip the network checks performed on candidates
     * @return void
     */
    public function analyze(bool $fastMode = false): void
    {
        $this->fastMode = $fastMode;

        $this->findCandidates()->testCandidates();
    }

    /**
     * Use various means to discover feed URLs from parent document.
     *
     * Chainable method.
     *
     * @return self
     */
    protected function findCandidates(): self
    {
        return $this->findFeedUrlsInLinkTags()
            ->findFeedsInWellKnownUrls();
    }

    /**
     * Go through every <link> tag already discovered by other analyzers, and
     * keep the "href" of those flagged as potential feeds ("couldBeFeed").
     * The result is stored in the "fromLinkTags" attribute.
     *
     * Chainable method.
     *
     * @return self
     */
    protected function findFeedUrlsInLinkTags(): self
    {
        $linkTags = data_get(
            $this->baseAttributes,
            'analyzers.*.linkTags'
        );

        $potentialFeeds = collect($linkTags)
            ->flatten(1)
            ->where('couldBeFeed', true)
            ->pluck('href')
            ->all();

        $this->setAttribute('fromLinkTags', $potentialFeeds);

        return $this;
    }

    /**
     * Resolve every URL listed in configuration against the document's URL
     * and store the resulting list in the "fromWellKnownUrls" attribute.
     *
     * Chainable method.
     *
     * @return self
     */
    protected function findFeedsInWellKnownUrls(): self
    {
        $documentUrl = data_get(
            $this->baseAttributes,
            'sourceUrl'
        );

        if (empty($documentUrl)) {
            // Without a base URL nothing can be resolved. The attribute is
            // still defined so that later steps always find an array.
            $this->setAttribute('fromWellKnownUrls', []);

            return $this;
        }

        $baseUrl        = Str::finish($documentUrl, '/');
        $potentialFeeds = [];

        // Cast guards against a missing or null configuration value.
        $wellKnownUrls = (array) config('analyzer-feed-discoverer.wellKnownUrls', []);

        foreach ($wellKnownUrls as $url) {
            $fullUrl = (new StringHelper($url))->resolveUrl($baseUrl);

            // Different configured entries may resolve to the same URL.
            if (!in_array($fullUrl, $potentialFeeds, true)) {
                $potentialFeeds[] = $fullUrl;
            }
        }

        $this->setAttribute('fromWellKnownUrls', $potentialFeeds);

        return $this;
    }

    /**
     * Check if candidate URLs can be reached.
     *
     * Does nothing in fast mode. Otherwise every distinct candidate is
     * fetched: on success the response's real URL is added to the
     * "reachables" attribute, on failure the candidate is added to
     * "nonReachables".
     *
     * @return void
     */
    protected function testCandidates(): void
    {
        if ($this->fastMode) {
            return;
        }

        // Casts ensure array_merge() always receives arrays, even if one of
        // the attributes has not been populated.
        $candidates = collect(
            array_merge(
                (array) $this->fromLinkTags,
                (array) $this->fromWellKnownUrls
            )
        )->unique();

        $reachables    = [];
        $nonReachables = [];

        foreach ($candidates as $url) {
            $response = Fetcher::fetch($url);

            if ($response->success) {
                $reachables[] = $response->realUrl;
            } else {
                $nonReachables[] = $url;
            }
        }

        $this->setAttribute('reachables', $reachables);
        $this->setAttribute('nonReachables', $nonReachables);
    }
}

200
package/Analyzers/HtmlAnalyzer.php

@ -0,0 +1,200 @@
<?php
declare(strict_types=1);
namespace Cyca\Analyzer\Analyzers;
use Cyca\Analyzer\Contracts\Analyzer;
use Cyca\Analyzer\Concerns\HasLinkTags;
use Cyca\Analyzer\Concerns\HasMetaTags;
use Cyca\Analyzer\Concerns\HasTitle;
use Cyca\Helpers\Helpers\StringHelper;
use DomDocument;
use DOMElement;
use DOMNode;
use DOMNodeList;
use DOMXPath;
use Exception;
use Illuminate\Support\Arr;
use Illuminate\Support\Str;
use League\Flysystem\FileNotFoundException;
/**
 * Analyzes HTML files.
 *
 * Loads the document's body from the file system, parses it into a DOM
 * document, then extracts the title, the <meta> tags and (outside of fast
 * mode) the <link> tags found in <head>.
 */
class HtmlAnalyzer extends BaseAnalyzer implements Analyzer
{
    use
        HasTitle,
        HasMetaTags,
        HasLinkTags;

    # --------------------------------------------------------------------------
    # ----| Properties |--------------------------------------------------------
    # --------------------------------------------------------------------------

    /**
     * The attributes that should be visible in serialization.
     *
     * @var array
     */
    protected $visible = [
        'title',
        'metaTags',
        'linkTags'
    ];

    /**
     * The attributes that are mass assignable.
     *
     * @var string[]
     */
    protected $fillable = [
        'title',
        'metaTags',
        'linkTags'
    ];

    /**
     * The attributes that should be cast.
     *
     * @var array
     */
    protected $casts = [
        'metaTags' => 'array',
        'linkTags' => 'array'
    ];

    /**
     * Document's body.
     *
     * @var string|null
     */
    protected ?string $body = null;

    /**
     * Provides temporary access to DOM document to analyzers.
     *
     * @var DOMDocument|null
     */
    protected ?DOMDocument $domDocument = null;

    # --------------------------------------------------------------------------
    # ----| Methods |-----------------------------------------------------------
    # --------------------------------------------------------------------------

    /**
     * Launch document's analysis.
     *
     * The title and <meta> tags are always extracted; <link> tags are only
     * extracted when fast mode is disabled.
     *
     * @param bool $fastMode
     * @return void
     *
     * @throws Exception When no source path is available or the file is missing
     */
    public function analyze(bool $fastMode = false): void
    {
        $this->loadDocumentBody()
            ->createDomDocument()
            ->findTitle()
            ->findMetaTags();

        if (!$fastMode) {
            $this->findLinkTags();
        }
    }

    /**
     * Load document's body into a variable.
     *
     * Chainable method.
     *
     * @return self
     *
     * @throws Exception             When the base attributes hold no "sourcePath"
     * @throws FileNotFoundException When the file does not exist
     */
    protected function loadDocumentBody(): self
    {
        $filePath = Arr::get($this->baseAttributes, 'sourcePath');

        if (empty($filePath)) {
            throw new Exception("There is no body to analyze");
        }

        if (!$this->fileSystem->exists($filePath)) {
            throw new FileNotFoundException($filePath);
        }

        $this->body = $this->fileSystem->get($filePath);

        return $this;
    }

    /**
     * Create a DOM document from document's body.
     *
     * An empty body produces an empty DOM document instead of being handed
     * to the parser, which rejects empty input on PHP 8.
     *
     * Chainable method.
     *
     * @return self
     */
    protected function createDomDocument(): self
    {
        $this->domDocument = new DOMDocument('1.0', 'UTF-8');

        if ($this->body === null || $this->body === '') {
            return $this;
        }

        // Non-ASCII characters are turned into HTML entities before parsing.
        // NOTE(review): the 'HTML-ENTITIES' target is deprecated as of
        // PHP 8.2 - consider a replacement when raising the minimum version.
        $this->body = mb_convert_encoding($this->body, 'HTML-ENTITIES', 'UTF-8');

        // Collect parser errors internally for the duration of the parsing
        // only, then hand the libxml error handling back in its previous
        // state so the rest of the application is not affected.
        $previousState = libxml_use_internal_errors(true);

        try {
            $this->domDocument->loadHtml($this->body);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previousState);
        }

        return $this;
    }

    /**
     * Find nodes corresponding to specified XPath query.
     *
     * @param string $xpathQuery
     *
     * @return DOMNodeList|false False when the query could not be evaluated
     */
    protected function findNodes(string $xpathQuery): DOMNodeList|false
    {
        $xpath = new DOMXPath($this->domDocument);

        return $xpath->query($xpathQuery);
    }

    /**
     * Find first node corresponding to specified XPath query.
     *
     * @param string $xpathQuery
     *
     * @return DOMNode|null Null when the query failed or matched nothing
     */
    protected function findFirstNode(string $xpathQuery): ?DOMNode
    {
        $nodes = $this->findNodes($xpathQuery);

        // findNodes() may return false; treat that as "no match".
        if ($nodes === false || $nodes->length === 0) {
            return null;
        }

        return $nodes->item(0);
    }

    /**
     * Place in an array all attributes of a specific DOMElement.
     *
     * Keys are the slugged attribute names, values are the attribute values
     * after being cleaned up by StringHelper.
     *
     * @param DOMElement $node
     *
     * @return array
     */
    protected function domElementToArray(DOMElement $node): array
    {
        $data = [];

        foreach ($node->attributes as $attribute) {
            $key   = Str::slug($attribute->localName);
            $value = (new StringHelper($attribute->nodeValue))->cleanup();

            $data[$key] = $value;
        }

        return $data;
    }
}

41
package/Concerns/HasLinkTags.php

@ -0,0 +1,41 @@
<?php
declare(strict_types=1);
namespace Cyca\Analyzer\Concerns;
use Cyca\Analyzer\Models\LinkTag;
trait HasLinkTags
{
    /**
     * Array of <link> tags.
     *
     * @var array
     */
    protected array $linkTagsArray = [];

    /**
     * Find and parse the <link> tags located in the document's <head>.
     *
     * Each tag is turned into a LinkTag model carrying the analyzer's base
     * attributes, and the resulting list is stored in the "linkTags"
     * attribute.
     *
     * Chainable method.
     *
     * @return self
     */
    protected function findLinkTags(): self
    {
        $nodes = $this->findNodes('//head/link');

        $this->linkTagsArray = [];

        foreach ($nodes as $node) {
            $data = $this->domElementToArray($node);

            $link = new LinkTag();

            // Base attributes must be set before filling the model.
            $link->setBaseAttributes($this->baseAttributes)
                ->forceFill($data);

            $this->linkTagsArray[] = $link;
        }

        // The list is built by appending, so it is already in document
        // order with sequential keys.
        $this->setAttribute('linkTags', $this->linkTagsArray);

        return $this;
    }
}

43
package/Concerns/HasMetaTags.php

@ -0,0 +1,43 @@
<?php
declare(strict_types=1);
namespace Cyca\Analyzer\Concerns;
use Cyca\Analyzer\Models\MetaTag;
trait HasMetaTags
{
    /**
     * Collected <meta> tags, in document order.
     *
     * @var array
     */
    protected array $metaTagsArray = [];

    /**
     * Collect every <meta> tag located in the document's <head> and expose
     * the resulting list through the "metaTags" attribute.
     *
     * Chainable method.
     *
     * @return self
     */
    protected function findMetaTags(): self
    {
        // Start from a clean list on every run.
        $this->metaTagsArray = [];

        foreach ($this->findNodes('//head/meta') as $element) {
            $tagAttributes = $this->domElementToArray($element);

            $tag = new MetaTag();
            $tag->setBaseAttributes($this->baseAttributes)->forceFill($tagAttributes);

            $this->metaTagsArray[] = $tag;
        }

        $this->setAttribute('metaTags', $this->metaTagsArray);

        return $this;
    }
}

26
package/Concerns/HasTitle.php

@ -0,0 +1,26 @@
<?php
declare(strict_types=1);
namespace Cyca\Analyzer\Concerns;
use Cyca\Helpers\Helpers\StringHelper;
trait HasTitle
{
    /**
     * Look up the <title> element of the document's <head> and, when there
     * is one, store its cleaned-up text in the "title" attribute.
     *
     * Chainable method.
     *
     * @return self
     */
    protected function findTitle(): self
    {
        $titleNode = $this->findFirstNode('//head/title');

        // No <title> element: leave the attribute untouched.
        if (empty($titleNode)) {
            return $this;
        }

        $title = (new StringHelper($titleNode->nodeValue))->cleanup();

        $this->setAttribute('title', $title);

        return $this;
    }
}

20
package/Facades/FeedDiscoverer.php

@ -0,0 +1,20 @@
<?php
declare(strict_types=1);
namespace Cyca\Analyzer\Facades;
use Illuminate\Support\Facades\Facade;
class FeedDiscoverer extends Facade
{
    /**
     * Get the registered name of the component.
     *
     * This is the container key the facade resolves its underlying
     * instance from.
     *
     * @return string
     */
    protected static function getFacadeAccessor(): string
    {
        return 'FeedDiscoverer';
    }
}

20
package/Facades/HtmlAnalyzer.php

@ -0,0 +1,20 @@
<?php
declare(strict_types=1);
namespace Cyca\Analyzer\Facades;
use Illuminate\Support\Facades\Facade;
class HtmlAnalyzer extends Facade
{
    /**
     * Get the registered name of the component.
     *
     * This is the container key the facade resolves its underlying
     * instance from.
     *
     * @return string
     */
    protected static function getFacadeAccessor(): string
    {
        return 'HtmlAnalyzer';
    }
}

110
package/Models/LinkTag.php

@ -0,0 +1,110 @@
<?php
declare(strict_types=1);
namespace Cyca\Analyzer\Models;
use Cyca\Helpers\Helpers\GenericObject;
use Cyca\Helpers\Helpers\StringHelper;
class LinkTag extends GenericObject
{
    /**
     * The accessors to append to the model's array form.
     *
     * @var array
     */
    protected $appends = ['couldBeFeed'];

    /**
     * Base analyzer's attributes. Useful to access properties such as
     * document's URL.
     *
     * @var array
     */
    protected array $baseAttributes = [];

    /**
     * Return a boolean value indicating if this link tag could link to a feed.
     *
     * The result is computed on first access and then kept in the model's
     * attributes.
     *
     * @return boolean
     */
    public function getCouldBeFeedAttribute(): bool
    {
        if (!array_key_exists('couldBeFeed', $this->attributes)) {
            $this->attributes['couldBeFeed'] = $this->checkIfCouldBeFeed();
        }

        return $this->attributes['couldBeFeed'];
    }

    /**
     * Set base analyzer's attributes.
     *
     * Chainable method.
     *
     * @param array $attributes
     * @return self
     */
    public function setBaseAttributes(array $attributes): self
    {
        $this->baseAttributes = $attributes;

        return $this;
    }

    /**
     * Define value for the "title" field. The value is cleaned up before
     * being stored.
     *
     * @param string $value
     * @return void
     */
    public function setTitleAttribute(string $value)
    {
        $value = (new StringHelper($value))->cleanup();

        $this->attributes['title'] = $value;
    }

    /**
     * Define value for the "href" field.
     *
     * When the base attributes provide the document's URL ("sourceUrl"), the
     * value is resolved against it; otherwise it is stored as given.
     *
     * @param string $value
     * @return void
     */
    public function setHrefAttribute(string $value)
    {
        $baseUrl = data_get($this->baseAttributes, 'sourceUrl');

        if (!empty($baseUrl)) {
            $value = (new StringHelper($value))->resolveUrl($baseUrl);
        }

        $this->attributes['href'] = $value;
    }

    /**
     * Return a boolean value indicating if this link tag could link to a feed.
     *
     * A tag qualifies when it has a non-empty "href", when its "rel"
     * attribute contains the "alternate" keyword, and when its "type" is
     * one of the configured feed MIME types.
     *
     * @return boolean
     */
    protected function checkIfCouldBeFeed(): bool
    {
        if (empty($this->attributes['href'])) {
            return false;
        }

        // "rel" holds a set of space-separated, case-insensitive keywords,
        // so look for the keyword instead of comparing the whole value.
        $relKeywords = preg_split(
            '/\s+/',
            strtolower((string) ($this->attributes['rel'] ?? '')),
            -1,
            PREG_SPLIT_NO_EMPTY
        ) ?: [];

        if (!in_array('alternate', $relKeywords, true)) {
            return false;
        }

        // MIME types are case-insensitive and may carry parameters
        // (e.g. "; charset=utf-8"): compare the bare, lower-cased type.
        $type = strtolower(trim(explode(';', (string) ($this->attributes['type'] ?? ''))[0]));

        if ($type === '') {
            return false;
        }

        // Cast guards against a missing or null configuration value.
        $feedMimeTypes = array_map(
            'strtolower',
            (array) config('analyzer-html.feedMimeTypes', [])
        );

        return in_array($type, $feedMimeTypes, true);
    }
}

47
package/Models/MetaTag.php

@ -0,0 +1,47 @@
<?php
declare(strict_types=1);
namespace Cyca\Analyzer\Models;
use Cyca\Helpers\Helpers\GenericObject;
use Cyca\Helpers\Helpers\StringHelper;
class MetaTag extends GenericObject
{
    /**
     * Attributes of the analyzer that produced this tag (gives access to
     * data such as the document's URL).
     *
     * @var array
     */
    protected array $baseAttributes = [];

    /**
     * Store the base analyzer's attributes on this tag.
     *
     * Chainable method.
     *
     * @param array $attributes
     * @return self
     */
    public function setBaseAttributes(array $attributes): self
    {
        $this->baseAttributes = $attributes;

        return $this;
    }

    /**
     * Mutator for the "content" field: the value is cleaned up before being
     * stored.
     *
     * @param string $value
     * @return void
     */
    public function setContentAttribute(string $value)
    {
        $helper = new StringHelper($value);

        $this->attributes['content'] = $helper->cleanup();
    }
}

4
package/Providers/AnalyzerServiceProvider.php

@ -17,6 +17,8 @@ class AnalyzerServiceProvider extends ServiceProvider
*/
public function register()
{
$this->mergeConfigFrom(__DIR__ . '/../../config/analyzer.php', 'analyzer');
$this->app->bind('Analyzer', function ($app) {
$client = new Analyzer();
@ -35,6 +37,6 @@ class AnalyzerServiceProvider extends ServiceProvider
{
$this->publishes([
__DIR__ . '/../../config/analyzer.php' => config_path('analyzer.php'),
]);
], 'config');
}
}

37
package/Providers/FeedDiscovererServiceProvider.php

@ -0,0 +1,37 @@
<?php
declare(strict_types=1);
namespace Cyca\Analyzer\Providers;
use Cyca\Analyzer\Analyzers\FeedDiscoverer;
use Illuminate\Support\ServiceProvider;
class FeedDiscovererServiceProvider extends ServiceProvider
{
    /**
     * Merge the package's default configuration and bind the
     * "FeedDiscoverer" key to a factory creating a new analyzer.
     *
     * @return void
     */
    public function register()
    {
        $defaults = __DIR__ . '/../../config/analyzer-feed-discoverer.php';

        $this->mergeConfigFrom($defaults, 'analyzer-feed-discoverer');

        $factory = function ($app) {
            return new FeedDiscoverer();
        };

        $this->app->bind('FeedDiscoverer', $factory);
    }

    /**
     * Declare the configuration file as publishable under the "config" tag.
     *
     * @return void
     */
    public function boot()
    {
        $source      = __DIR__ . '/../../config/analyzer-feed-discoverer.php';
        $destination = config_path('analyzer-feed-discoverer.php');

        $this->publishes([$source => $destination], 'config');
    }
}

37
package/Providers/HtmlAnalyzerServiceProvider.php

@ -0,0 +1,37 @@
<?php
declare(strict_types=1);
namespace Cyca\Analyzer\Providers;
use Cyca\Analyzer\Analyzers\HtmlAnalyzer;
use Illuminate\Support\ServiceProvider;
class HtmlAnalyzerServiceProvider extends ServiceProvider
{
    /**
     * Merge the package's default configuration and bind the
     * "HtmlAnalyzer" key to a factory creating a new analyzer.
     *
     * @return void
     */
    public function register()
    {
        $defaults = __DIR__ . '/../../config/analyzer-html.php';

        $this->mergeConfigFrom($defaults, 'analyzer-html');

        $factory = function ($app) {
            return new HtmlAnalyzer();
        };

        $this->app->bind('HtmlAnalyzer', $factory);
    }

    /**
     * Declare the configuration file as publishable under the "config" tag.
     *
     * @return void
     */
    public function boot()
    {
        $source      = __DIR__ . '/../../config/analyzer-html.php';
        $destination = config_path('analyzer-html.php');

        $this->publishes([$source => $destination], 'config');
    }
}

6
tests/TestCase.php

@ -12,8 +12,6 @@ class TestCase extends \Orchestra\Testbench\TestCase
*/
protected function getEnvironmentSetup($app)
{
$app['config']->set('fetcher', include(__DIR__ . '/../vendor/cyca/fetcher/config/fetcher.php'));
$app['config']->set('fetcher.cache_store', 'array');
}
/**
@ -27,8 +25,10 @@ class TestCase extends \Orchestra\Testbench\TestCase
{
return [
'Cyca\Analyzer\Providers\AnalyzerServiceProvider',
'Cyca\Analyzer\Providers\FeedDiscovererServiceProvider',
'Cyca\Analyzer\Providers\HtmlAnalyzerServiceProvider',
'Cyca\Fetcher\Providers\FetcherServiceProvider',
'Cyca\Fetcher\Http\Providers\HttpFetcherServiceProvider',
'Cyca\Fetcher\Providers\HttpServiceProvider',
];
}
}

Loading…
Cancel
Save