diff --git a/app/Jobs/Searcher.php b/app/Jobs/Searcher.php
index eb253a325b08cbcd645086cc7bc6db17a90c10c6..2d5d7393759ae81a483cfc789ac6f0ac69c9b027 100644
--- a/app/Jobs/Searcher.php
+++ b/app/Jobs/Searcher.php
@@ -3,18 +3,16 @@
 namespace App\Jobs;
 
 use Illuminate\Bus\Queueable;
-use Illuminate\Queue\SerializesModels;
-use Illuminate\Queue\InteractsWithQueue;
 use Illuminate\Contracts\Queue\ShouldQueue;
-use Illuminate\Foundation\Bus\DispatchesJobs;
+use Illuminate\Queue\InteractsWithQueue;
+use Illuminate\Queue\SerializesModels;
 use Illuminate\Support\Facades\Redis;
-use Log;
 
 class Searcher implements ShouldQueue
 {
     use InteractsWithQueue, Queueable, SerializesModels;
 
-    protected $name, $ch, $pid, $counter, $lastTime, $connectionInfo;
+    protected $name, $ch, $pid, $counter, $lastTime, $connectionInfo, $user, $password;
     # Each Searcher will shut down after a specified time (in seconds) or number of requests
     protected $MAX_REQUESTS = 100;
     # This value should always be below the retry_after value in config/queue.php
@@ -34,12 +32,14 @@ class Searcher implements ShouldQueue
      * keep-alive requests.
      * @return void
      */
-    public function __construct($name)
+    public function __construct($name, $user = null, $password = null)
     {
         $this->name = $name;
         $this->pid = getmypid();
         $this->recheck = false;
         $this->startTime = microtime(true);
+        $this->user = $user;
+        $this->password = $password;
         // Submit this worker to the Redis system
         Redis::expire($this->name, 5);
     }
@@ -53,95 +53,101 @@ class Searcher implements ShouldQueue
     {
         // This Searcher is freshly started, so we need to initialize the curl handle $ch
         $this->ch = $this->initCurlHandle();
-        $this->counter = 0; // Counts the number of answered jobs
-        $time = microtime(true);
-        while(true){
-            // Update the expire
-            Redis::expire($this->name, 5);
-            Redis::expire($this->name . ".stats", 5);
-            // One Searcher can handle a ton of requests to the same server
-            // Each search to the server of this Searcher will be submitted to a queue
-            // stored in redis which has the same name as this searchengine appended by a ".queue"
-            // We will perform a blocking pop on this queue so the queue can remain empty for a while
-            // without killing this searcher directly.
-            $mission = Redis::blpop($this->name . ".queue", 4);
-            $this->counter++;
-            $this->updateStats(microtime(true) - $time);
-            $this->switchToRunning();
-            // The mission can be empty when blpop hit the timeout
-            if(!empty($mission)){
-                $mission = $mission[1];
-                $poptime = microtime(true) - $time;
-
-                // The mission is a String which can be divided to retrieve three informations:
-                // 1. The Hash Value where the result should be stored
-                // 2. The Url to Retrieve
-                // 3. The maximum time to take
-                // These three informations are divided by a ";" in the mission string
-                $mission = explode(";", $mission);
-                $hashValue = $mission[0]; // The hash value for redis to store the results under
-                $url = base64_decode($mission[1]); // The url to fetch
-                $timeout = $mission[2]; // Timeout from the MetaGer process in ms
-                $medianFetchTime = $this->getFetchTime(); // The median Fetch time of the search engine in ms
-                Redis::hset('search.' . $hashValue, $this->name, "connected");
-
-                $result = $this->retrieveUrl($url);
-
-                $this->storeResult($result, $poptime, $hashValue);
-
-                // Reset the time of the last Job so we can calculate
-                // the time we have spend waiting for a new job
-                // We submit that calculation to the Redis systemin the method
-                $time = microtime(true);
+        try {
+            $this->counter = 0; // Counts the number of answered jobs
+            $time = microtime(true);
+            while (true) {
+                // Update the expiry
+                Redis::expire($this->name, 5);
+                Redis::expire($this->name . ".stats", 5);
+                // One Searcher can handle a ton of requests to the same server.
+                // Each search against the server of this Searcher is submitted to a queue
+                // stored in redis, named after this search engine with ".queue" appended.
+                // We perform a blocking pop on this queue, so the queue can remain empty for a while
+                // without killing this Searcher directly.
+                $mission = Redis::blpop($this->name . ".queue", 4);
+                $this->counter++;
+                $this->updateStats(microtime(true) - $time);
+                $this->switchToRunning();
+                // The mission can be empty when blpop hits the timeout
+                if (!empty($mission)) {
+                    $mission = $mission[1];
+                    $poptime = microtime(true) - $time;
+
+                    // The mission is a string which can be split into three pieces of information:
+                    // 1. The hash value under which the result should be stored
+                    // 2. The URL to retrieve
+                    // 3. The maximum time to take
+                    // These three pieces of information are separated by a ";" in the mission string
+                    $mission = explode(";", $mission);
+                    $hashValue = $mission[0]; // The hash value for redis to store the results under
+                    $url = base64_decode($mission[1]); // The url to fetch
+                    $timeout = $mission[2]; // Timeout from the MetaGer process in ms
+                    $medianFetchTime = $this->getFetchTime(); // The average fetch time of the search engine in ms
+                    Redis::hset('search.' . $hashValue, $this->name, "connected");
+
+                    $result = $this->retrieveUrl($url);
+
+                    $this->storeResult($result, $poptime, $hashValue);
+
+                    // Reset the time of the last job so we can calculate
+                    // the time we have spent waiting for a new job.
+                    // We submit that calculation to the Redis system in updateStats()
+                    $time = microtime(true);
+                }
+
+                // In sync mode every Searcher may only retrieve one result because it would block
+                // the execution of the remaining code otherwise:
+                if (getenv("QUEUE_DRIVER") === "sync"
+                    || $this->counter > $this->MAX_REQUESTS
+                    || (microtime(true) - $this->startTime) > $this->MAX_TIME) {
+                    break;
+                }
             }
-
-            // In sync mode every Searcher may only retrieve one result because it would block
-            // the execution of the remaining code otherwise:
-            if(getenv("QUEUE_DRIVER") === "sync"
-                || $this->counter > $this->MAX_REQUESTS
-                || (microtime(true)-$this->startTime) > $this->MAX_TIME){
-                break;
-            }
+        } finally {
+            // When we reach this point, the time has come for this Searcher to retire
+            $this->shutdown();
         }
-        // When we reach this point, time has come for this Searcher to retire
-        $this->shutdown();
     }
 
-    private function switchToRunning(){
+    private function switchToRunning()
+    {
         /**
-         * When a Searcher is initially started the redis value for $this->name is set to "locked"
-         * which effectively will prevent new Searchers of this type to be started. (Value is checked by the MetaGer process which starts the Searchers)
-         * This is done so the MetaGer processes won't start hundreds of Searchers parallely when under high work load.
-         * It will force that Searchers can only be started one after the other.
-         * When a new Searcher has served a minimum of three requests we have enough data to decide whether we need even more Searchers.
-         * To do so we will then set the redis value for $this->name to "running".
-         * There is a case where we don't want new Searchers to be started even if we would need to do so to serve every Request:
-         * When a search engine needs more time to produce search results than the timeout of the MetaGer process, we won't even bother of spawning
-         * more and more Searchers because they would just block free worker processes from serving the important engines which will give results in time.
-         **/
-        if($this->counter === 3 || getenv("QUEUE_DRIVER") === "sync"){
+         * When a Searcher is initially started, the redis value for $this->name is set to "locked",
+         * which effectively prevents new Searchers of this type from being started. (The value is checked by the MetaGer process which starts the Searchers.)
+         * This is done so the MetaGer processes won't start hundreds of Searchers in parallel when under high load;
+         * it forces Searchers to be started one after the other.
+         * Once a new Searcher has served a minimum of three requests we have enough data to decide whether we need even more Searchers.
+         * To do so we will then set the redis value for $this->name to "running".
+         * There is a case where we don't want new Searchers to be started even if we would need them to serve every request:
+         * when a search engine needs more time to produce search results than the timeout of the MetaGer process, we won't even bother spawning
+         * more and more Searchers, because they would just block free worker processes from serving the important engines which deliver results in time.
+         **/
+        if ($this->counter === 3 || getenv("QUEUE_DRIVER") === "sync") {
             # If the MetaGer process waits longer for the results than this Fetcher will probably need to fetch
             # Or if this engine is in the array of important engines which we will always try to serve
             Redis::set($this->name, "running");
             $this->recheck = false;
         }
     }
 
-    private function updateStats($poptime){
-        if($this->connectionInfo !== NULL){
+    private function updateStats($poptime)
+    {
+        if ($this->connectionInfo !== null) {
             $connectionInfo = base64_encode(json_encode($this->connectionInfo));
             Redis::hset($this->name . ".stats", $this->pid, $connectionInfo . ";" . $poptime);
         }
     }
 
-    private function getFetchTime(){
+    private function getFetchTime()
+    {
         $vals = Redis::hgetall($this->name . ".stats");
-        if(sizeof($vals) === 0){
+        if (sizeof($vals) === 0) {
             return 0;
-        }else{
+        } else {
             $totalTime = 0;
             foreach ($vals as $pid => $value) {
                 $time = floatval(json_decode(base64_decode(explode(";", $value)[0]), true)["total_time"]);
-                $time *= 1000;  // Transform from seconds to milliseconds
+                $time *= 1000; // Transform from seconds to milliseconds
                 $totalTime += $time;
             }
             $totalTime /= sizeof($vals);
@@ -149,7 +155,8 @@ class Searcher implements ShouldQueue
         }
     }
 
-    private function retrieveUrl($url){
+    private function retrieveUrl($url)
+    {
         // Set this URL on the curl handle
         curl_setopt($this->ch, CURLOPT_URL, $url);
         $result = curl_exec($this->ch);
@@ -157,36 +164,43 @@ class Searcher implements ShouldQueue
         return $result;
     }
 
-    private function storeResult($result, $poptime, $hashValue){
+    private function storeResult($result, $poptime, $hashValue)
+    {
         Redis::hset('search.' . $hashValue, $this->name, $result);
         // After 60 seconds the results should be read by the MetaGer process and stored in the Cache instead
         Redis::expire('search.' . $hashValue, 60);
 
         $this->lastTime = microtime(true);
     }
 
-    private function shutdown(){
+    private function shutdown()
+    {
         Redis::hdel($this->name . ".stats", $this->pid);
-        if(sizeof(Redis::hgetall($this->name . ".stats")) === 0){
+        if (sizeof(Redis::hgetall($this->name . ".stats")) === 0) {
             Redis::del($this->name);
         }
         // We should close our curl handle before we retire
         curl_close($this->ch);
     }
 
-    private function initCurlHandle(){
+    private function initCurlHandle()
+    {
         $ch = curl_init();
 
         curl_setopt_array($ch, array(
-            CURLOPT_RETURNTRANSFER => 1,
-            CURLOPT_USERAGENT => "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
-            CURLOPT_FOLLOWLOCATION => TRUE,
-            CURLOPT_CONNECTTIMEOUT => 10,
-            CURLOPT_MAXCONNECTS => 500,
-            CURLOPT_LOW_SPEED_LIMIT => 500,
-            CURLOPT_LOW_SPEED_TIME => 5,
-            CURLOPT_TIMEOUT => 10
+            CURLOPT_RETURNTRANSFER  => 1,
+            CURLOPT_USERAGENT       => "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
+            CURLOPT_FOLLOWLOCATION  => true,
+            CURLOPT_CONNECTTIMEOUT  => 10,
+            CURLOPT_MAXCONNECTS     => 500,
+            CURLOPT_LOW_SPEED_LIMIT => 500,
+            CURLOPT_LOW_SPEED_TIME  => 5,
+            CURLOPT_TIMEOUT         => 10,
         ));
+        if ($this->user !== null && $this->password !== null) {
+            curl_setopt($ch, CURLOPT_USERPWD, $this->user . ":" . $this->password);
+        }
+
         return $ch;
     }
 }
diff --git a/app/Models/Searchengine.php b/app/Models/Searchengine.php
index dff637a18b9a9f2f4a5a9ed61f5f530c1e98eaa0..e537a164942d4e6811453fcdcbeac81c2acd2138 100644
--- a/app/Models/Searchengine.php
+++ b/app/Models/Searchengine.php
@@ -15,12 +15,12 @@ abstract class Searchengine
     public $ch; # Curl handle used to fetch the results
     public $getString = ""; # The string for the GET request
     public $engine; # The original engine XML
-    public $enabled = true; # true, wenn die Suchmaschine nicht explizit disabled ist
-    public $results = []; # Die geladenen Ergebnisse
-    public $ads = []; # Die geladenen Werbungen
+    public $enabled = true;     # true if the search engine is not explicitly disabled
+    public $results = [];       # The loaded results
+    public $ads = [];           # The loaded ads
     public $products = []; # The loaded products
-    public $loaded = false; # wahr, sobald die Ergebnisse geladen wurden
-    public $cached = false;
+    public $loaded = false;     # true once the results have been loaded
+    public $cached = false;
 
     public $ip; # The IP from the MetaGer request
     public $uses; # The number of times this search engine has been used
@@ -32,9 +32,9 @@ abstract class Searchengine
     public $hash; # The hash value of this search engine
 
     public $fp; # Needed for artefacts
-    public $socketNumber = null; # Wird für Artefakte benötigt
-    public $counter = 0; # Wird eventuell für Artefakte benötigt
-    public $write_time = 0; # Wird eventuell für Artefakte benötigt
+    public $socketNumber = null;    # Needed for artefacts
+    public $counter = 0;            # Possibly needed for artefacts
+    public $write_time = 0;         # Possibly needed for artefacts
     public $connection_time = 0; # Possibly needed for artefacts
 
     public function __construct(\SimpleXMLElement $engine, MetaGer $metager)
@@ -69,7 +69,7 @@ abstract class Searchengine
         }
 
         $this->useragent = $metager->getUserAgent();
-        $this->ip = $metager->getIp();
+        $this->ip        = $metager->getIp();
         $this->startTime = microtime();
 
         # Generate the search string
@@ -84,10 +84,10 @@ abstract class Searchengine
         } else {
             $q = $metager->getQ();
         }
-        $this->getString = $this->generateGetString($q, $metager->getUrl(), $metager->getLanguage(), $metager->getCategory());
-        $this->hash = md5($this->host . $this->getString . $this->port . $this->name);
+        $this->getString  = $this->generateGetString($q, $metager->getUrl(), $metager->getLanguage(), $metager->getCategory());
+        $this->hash       = md5($this->host . $this->getString . $this->port . $this->name);
         $this->resultHash = $metager->getHashCode();
-        $this->canCache = $metager->canCache();
+        $this->canCache   = $metager->canCache();
         if (!isset($this->additionalHeaders)) {$this->additionalHeaders = "";}
     }
 
@@ -114,13 +114,13 @@ abstract class Searchengine
         // With <ResultHash> being the hash value under which the fetcher will store the result
         // and <URL to fetch> being the full URL to the search engine
         $url = "";
-        if($this->port === "443"){
+        if ($this->port === "443") {
             $url = "https://";
-        }else{
+        } else {
             $url = "http://";
         }
         $url .= $this->host;
-        if($this->port !== 80 && $this->port !== 443){
+        if ($this->port !== 80 && $this->port !== 443) {
             $url .= ":" . $this->port;
         }
         $url .= $this->getString;
@@ -132,27 +132,27 @@ abstract class Searchengine
         Redis::rpush($this->name . ".queue", $mission);
 
         /**
-         * We have Searcher processes running for MetaGer
-         * Each Searcher is dedicated to one specific Searchengine and fetches it's results.
-         * We can have multiple Searchers for each engine, if needed.
-         * At this point we need to decide, whether we need to start a new Searcher process or
-         * if we have enough of them running.
-         * The information for that is provided through the redis system. Each running searcher
-         * gives information how long it has waited to be given the last fetcher job.
-         * The longer this time value is, the less frequent the search engine is used and the less
-         * searcher of that type we need.
-         * But if it's too low, i.e. 100ms, then the searcher is near to it's full workload and needs assistence.
-         **/
+         * We have Searcher processes running for MetaGer.
+         * Each Searcher is dedicated to one specific search engine and fetches its results.
+         * We can have multiple Searchers for each engine, if needed.
+         * At this point we need to decide whether we need to start a new Searcher process or
+         * if we have enough of them running.
+         * The information for that is provided through the redis system: each running Searcher
+         * reports how long it waited to be given its last fetch job.
+         * The longer this time value is, the less frequently the search engine is used and the fewer
+         * Searchers of that type we need.
+         * But if it is too low, e.g. 100ms, then the Searcher is near its full workload and needs assistance.
+         **/
         $needSearcher = false;
         $searcherData = Redis::hgetall($this->name . ".stats");
 
         // We now have an array of statistical data from the searchers
         // Each searcher has one entry in it.
-        // So if it's empty, then we have currently no searcher running and
+        // So if it's empty, then we currently have no searcher running and
         // of course need to spawn a new one.
-        if(sizeof($searcherData) === 0){
+        if (sizeof($searcherData) === 0) {
             $needSearcher = true;
-        }else{
+        } else {
             // There we go:
             // There's at least one Fetcher running for this search engine.
             // Now we have to check if the current count is enough to fetch all the
@@ -160,18 +160,18 @@ abstract class Searchengine
             // Let's hardcode a minimum of 100ms between every search job.
             // First calculate the average of all wait times (an arithmetic mean, despite the variable name)
             $median = 0;
-            foreach($searcherData as $pid => $data){
+            foreach ($searcherData as $pid => $data) {
                 $data = explode(";", $data);
                 $median += floatval($data[1]);
             }
             $median /= sizeof($searcherData);
-            if($median < .1){
+            if ($median < .1) {
                 $needSearcher = true;
             }
         }
-        if($needSearcher && Redis::get($this->name) !== "locked"){
+        if ($needSearcher && Redis::get($this->name) !== "locked") {
             Redis::set($this->name, "locked");
-            $this->dispatch(new Searcher($this->name));
+            $this->dispatch(new Searcher($this->name, $this->user, $this->password));
         }
     }
 
@@ -203,7 +203,8 @@ abstract class Searchengine
         $this->enabled = true;
     }
 
-    public function setResultHash($hash){
+    public function setResultHash($hash)
+    {
         $this->resultHash = $hash;
     }
 
diff --git a/app/Models/parserSkripte/Fastbot.php b/app/Models/parserSkripte/Fastbot.php
deleted file mode 100644
index 63a3a12c6e27f7239235df3f3a370b907dbe7359..0000000000000000000000000000000000000000
--- a/app/Models/parserSkripte/Fastbot.php
+++ /dev/null
@@ -1,49 +0,0 @@
-<?php
-
-namespace app\Models\parserSkripte;
-
-use App\Models\Searchengine;
-
-class Fastbot extends Searchengine
-{
-    public $results = [];
-
-    public function __construct(\SimpleXMLElement $engine, \App\MetaGer $metager)
-    {
-        parent::__construct($engine, $metager);
-        if (strpos($this->urlEncode($metager->getEingabe()), "%") !== false) {
-            $this->enabled = false;
-            return null;
-        }
-    }
-
-    public function loadResults($result)
-    {
-        $result = utf8_encode($result);
-        $counter = 0;
-        foreach (explode("\n", $result) as $line) {
-            $line = trim($line);
-            if (strlen($line) > 0) {
-                # Hier bekommen wir jedes einzelne Ergebnis
-                $result = explode("|:|", $line);
-                if(count($result) < 4) continue;
-                $link = $result[1];
-                $link = substr($link, strpos($link, "href=\"") + 6);
-                $link = substr($link, 0, strpos($link, "\""));
-                $counter++;
-                $this->gefVon = "<a href=\"" . $this->homepage . "\" target=\"_blank\" rel=\"noopener\">" . $this->displayName . " " . trans('results.redirect') . "</a>";
-                $this->results[] = new \App\Models\Result(
-                    $this->engine,
-                    trim(strip_tags($result[1])),
-                    $link,
-                    $result[3],
-                    $result[2],
-                    $this->displayName,$this->homepage,
-                    $counter
-                );
-            }
-
-        }
-
-    }
-}
diff --git a/app/Models/parserSkripte/Scopia.php b/app/Models/parserSkripte/Scopia.php
new file mode 100644
index 0000000000000000000000000000000000000000..b728de0bdd478007ac851ce5f8f7864bbe38cad3
--- /dev/null
+++ b/app/Models/parserSkripte/Scopia.php
@@ -0,0 +1,82 @@
+<?php
+
+namespace app\Models\parserSkripte;
+
+use App\Models\Searchengine;
+use Log;
+
+class Scopia extends Searchengine
+{
+    public $results = [];
+
+    public function __construct(\SimpleXMLElement $engine, \App\MetaGer $metager)
+    {
+        parent::__construct($engine, $metager);
+    }
+
+    public function loadResults($result)
+    {
+        $result = html_entity_decode($result);
+        $result = str_replace("&", "&amp;", $result);
+        try {
+
+            $content = simplexml_load_string($result);
+            if (!$content) {
+                return;
+            }
+
+            $results = $content->xpath('//results/result');
+            foreach ($results as $result) {
+                $title = $result->title->__toString();
+                $link = $result->url->__toString();
+                $anzeigeLink = $link;
+                $descr = $result->description->__toString();
+                $this->counter++;
+                $this->results[] = new \App\Models\Result(
+                    $this->engine,
+                    $title,
+                    $link,
+                    $anzeigeLink,
+                    $descr,
+                    $this->gefVon,
+                    $this->counter
+                );
+            }
+        } catch (\Exception $e) {
+            Log::error("A problem occurred parsing results from $this->name:");
+            Log::error($e->getMessage());
+            return;
+        }
+    }
+
+    public function getNext(\App\MetaGer $metager, $result)
+    {
+        $result = html_entity_decode($result);
+        $result = str_replace("&", "&amp;", $result);
+        try {
+            $content = simplexml_load_string($result);
+
+        } catch (\Exception $e) {
+            Log::error("A problem occurred parsing results from $this->name:");
+            Log::error($e->getMessage());
+            return;
+        }
+
+        if (!$content) {
+            return;
+        }
+
+        $more = $content->xpath('//results/more')[0]->__toString() === "1" ? true : false;
+
+        if ($more) {
+            $results = $content->xpath('//results/result');
+            $number = $results[sizeof($results) - 1]->number->__toString();
+            # Create the new search engine object and adjust its get string:
+            $next = new Scopia(simplexml_load_string($this->engine), $metager);
+            $next->getString = preg_replace("/\\?s=.*?&/si", "?s=" . $number, $next->getString);
+            $next->hash = md5($next->host . $next->getString . $next->port . $next->name);
+            $this->next = $next;
+        }
+
+    }
+}