Browse Source

20210813-better-feeds-discovery (#8)

Co-authored-by: Richard Dern <richard@athaliasoft.com>
Reviewed-on: #8
Co-authored-by: Richard Dern <richard@noreply.git.athaliasoft.com>
Co-committed-by: Richard Dern <richard@noreply.git.athaliasoft.com>
pull/10/head 1.3.0
Richard Dern 2 months ago
parent
commit
81d54c1ae1
  1. 46
      config/analyzer-feed-discoverer.php
  2. 2
      package/Analyzer.php
  3. 25
      package/Analyzers/FeedDiscoverer.php
  4. 3
      phpunit.xml
  5. 32
      tests/Feature/FeedDiscovererTest.php

46
config/analyzer-feed-discoverer.php

@ -2,28 +2,32 @@
return [
/**
* Array of URLs to try to find feeds. Sometimes, websites don't declare
* feeds in their <link> tags, but they do provide feeds.
* Instruct feed discoverer to verify discovered feeds. If this is set to
* false, analyzer will simply return an array of discovered feeds URLs. If
* set to true, analyzer will also try to reach them, which will increase
* analysis duration.
*
* URLs provided here will be tested, and if the server replies with a valid
* response, we'll add them to the list of discovered feeds.
* Regardless of this setting, feeds availability will be checked during
* feeds update.
*
* URLs specified here may be absolute (in case the feed could be served
* from a central provider) or relative (in which case, they will be
* resolved from document's url).
* Default value: false
*/
'wellKnownUrls' => [
'/rss',
'/.rss',
'./rss',
'./.rss',
'/atom',
'/.atom',
'./atom',
'./.atom',
'/feed',
'/.feed',
'./feed',
'./.feed',
]
'verifyFeeds' => env('ANALYZER_VERIFY_FEEDS', false),
/**
* Known URLs to find feeds. This will force add specified URLs to the list
* of discovered feeds, even if they are not explicitly declared by the
* remote host.
*/
'knownUrls' => [
/**
* Match by domain. Keys of this array are used as regex, so take care
* to escape dots and other special characters.
*/
'byDomain' => [
'/(\.)?reddit\.com$/i' => [
'.rss'
]
],
],
];

2
package/Analyzer.php

@ -274,7 +274,7 @@ class Analyzer extends GenericObject implements Cachable
$attributes['analyzers'][$className] = $instance->toArray();
}
$this->setAttribute('analyzers', $attributes['analyzers']);
$this->setAttribute('analyzers', data_get($attributes, 'analyzers', []));
}
/**

25
package/Analyzers/FeedDiscoverer.php

@ -117,15 +117,28 @@ class FeedDiscoverer extends BaseAnalyzer implements Analyzer
$potentialFeeds = [];
foreach (config('analyzer-feed-discoverer.wellKnownUrls') as $url) {
$fullUrl = (new StringHelper($url))->resolveUrl(Str::finish($documentUrl, '/'));
foreach (config('analyzer-feed-discoverer.knownUrls.byDomain') as $regex => $urls) {
$domain = data_get($this->baseAttributes, 'fetcher.idn');
if (!in_array($fullUrl, $potentialFeeds)) {
$potentialFeeds[] = $fullUrl;
if (empty($domain)) {
continue;
}
if (!preg_match($regex, $domain)) {
dump($domain);
continue;
}
foreach ($urls as $url) {
$fullUrl = (new StringHelper($url))->resolveUrl(Str::finish($documentUrl, '/'));
if (!in_array($fullUrl, $potentialFeeds)) {
$potentialFeeds[] = $fullUrl;
}
}
}
$this->setAttribute('fromWellKnownUrls', $potentialFeeds);
$this->setAttribute('knownUrls', $potentialFeeds);
return $this;
}
@ -137,7 +150,7 @@ class FeedDiscoverer extends BaseAnalyzer implements Analyzer
*/
protected function testCandidates(): void
{
if ($this->fastMode) {
if ($this->fastMode || !config('analyzer-feed-discoverer.verifyFeeds')) {
return;
}

3
phpunit.xml

@ -8,6 +8,9 @@
<testsuite name="Unit">
<directory suffix="Test.php">./tests/Unit</directory>
</testsuite>
<testsuite name="Feature">
<directory suffix="Test.php">./tests/Feature</directory>
</testsuite>
</testsuites>
<coverage processUncoveredFiles="true">
<include>

32
tests/Feature/FeedDiscovererTest.php

@ -0,0 +1,32 @@
<?php
namespace Cyca\Analyzer\Tests\Feature;
use Cyca\Analyzer\Facades\Analyzer;
use Cyca\Analyzer\Tests\TestCase;
use Illuminate\Support\Facades\Http;
/**
 * Feature tests for feed discovery performed through the Analyzer facade.
 */
class FeedDiscovererTest extends TestCase
{
    /**
     * Analyzing a reddit.com page must list its ".rss" URL in the
     * FeedDiscoverer "knownUrls" attribute, even though the faked page body
     * declares no feed at all.
     */
    public function test_reddit_feed_discovery(): void
    {
        $url = 'www.reddit.com/r/worldnews/';
        $expectedFeedUrl = 'https://www.reddit.com/r/worldnews/.rss';

        // Serve a canned response for the analyzed page.
        Http::fake([
            $url => Http::response('<html><body>Success!</body></html>', 200),
        ]);

        $analyzer = Analyzer::analyzeUrl('https://' . $url);

        // Default to an empty list so that a missing attribute makes the
        // assertion fail instead of raising a TypeError.
        $knownUrls = data_get(
            $analyzer->toArray(),
            'analyzers.Cyca\Analyzer\Analyzers\FeedDiscoverer.knownUrls',
            []
        );

        // assertContains compares strictly and reports the actual list on failure.
        $this->assertContains($expectedFeedUrl, $knownUrls);
    }
}
Loading…
Cancel
Save