diff --git a/bin/crawl.php b/bin/crawl.php
index c42578a..f86c263 100755
--- a/bin/crawl.php
+++ b/bin/crawl.php
@@ -25,6 +25,7 @@
         $urls = extract_urls_recursively($url, $depth, 'extract_unique_urls');
         break;
     case 'worker':
+        predis()->del(VISITED_URLS_KEY);
         $urls = extract_urls_with_worker($url, $depth, uniqid());
         break;
     case 'promise':
diff --git a/bin/worker.php b/bin/worker.php
index f12783e..48279a7 100755
--- a/bin/worker.php
+++ b/bin/worker.php
@@ -23,7 +23,7 @@ function blpop(string $key): string {
     // upravit kód tak, aby nedocházelo ke ztrátě dat
     // if (rand(0,1)) exit(1);
 
-    foreach (extract_urls($url) as $u) {
+    foreach (worker_extract_unique_urls($url) as $u) {
         predis()->rpush($results, json_encode($u));
 
         if ($depth > 0) {
diff --git a/src/functions.php b/src/functions.php
index e14f379..5838d07 100644
--- a/src/functions.php
+++ b/src/functions.php
@@ -51,6 +51,23 @@ function predis(): Predis\Client {
     return $client ?? $client = new Predis\Client('tcp://redis');
 }
 
+// Redis hash key under which worker mode records the URLs it has already seen.
+const VISITED_URLS_KEY = 'visited';
+
+// Yields only those URLs extracted from $url that are not yet recorded in the
+// shared Redis hash VISITED_URLS_KEY. Each URL is stored as the field md5($u);
+// HSETNX writes the field only when it is absent, so a URL is yielded at most once.
+// NOTE(review): the key is global, not per crawl, and bin/crawl.php deletes it at
+// the start of every worker-mode crawl, so overlapping crawls disturb each other.
+function worker_extract_unique_urls(string $url): Generator {
+    foreach (extract_urls($url) as $u) {
+        // truthy only when this call created the field, i.e. the URL is new
+        if (predis()->hsetnx(VISITED_URLS_KEY, md5($u), 1)) {
+            yield $u;
+        }
+    }
+}
+
 // přidá zadanou adresu do fronty pro zpracování workerem
 function extract_urls_with_worker(string $url, int $depth, string $results): Generator {
     predis()->rpush('queue', json_encode([$url, $depth, $results]));