Commit e07a39d3 authored by Dominik Hebeler
Browse files

Merge branch '42-bugs' into 'master'

42 bugs

Closes #42

See merge request !42
parents a8eb439c 2672c096
......@@ -599,7 +599,7 @@ class MetaGer
{
return $this->stopWords;
}
public function getHostCount(String $host)
public function getHostCount($host)
{
if(isset($this->addedHosts[$host]))
{
......@@ -609,7 +609,7 @@ class MetaGer
return 0;
}
}
public function addHostCount(String $host)
public function addHostCount($host)
{
$hash = md5($host);
if(isset($this->addedHosts[$hash]))
......@@ -624,7 +624,7 @@ class MetaGer
{
return $this->site;
}
public function addLink(String $link)
public function addLink($link)
{
$hash = md5($link);
if(isset($this->addedLinks[$hash]))
......@@ -638,7 +638,7 @@ class MetaGer
}
}
public function generateSearchLink(String $fokus)
public function generateSearchLink($fokus)
{
$requestData = $this->request->except('page');
$requestData['focus'] = $fokus;
......@@ -654,7 +654,7 @@ class MetaGer
return $link;
}
public function generateSiteSearchLink(String $host)
public function generateSiteSearchLink($host)
{
$host = urlencode($host);
$requestData = $this->request->except('page');
......@@ -664,7 +664,7 @@ class MetaGer
return $link;
}
public function generateRemovedHostLink (String $host)
public function generateRemovedHostLink ($host)
{
$host = urlencode($host);
$requestData = $this->request->except('page');
......@@ -673,7 +673,7 @@ class MetaGer
return $link;
}
public function generateRemovedDomainLink (String $domain)
public function generateRemovedDomainLink ($domain)
{
$domain = urlencode($domain);
$requestData = $this->request->except('page');
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -193,7 +193,7 @@ class Result
}
}
private function getStrippedHost (String $link)
private function getStrippedHost ($link)
{
if(strpos($link, "http") !== 0)
$link = "http://" . $link;
......@@ -201,7 +201,7 @@ class Result
$link = preg_replace("/^www\./si", "", $link);
return $link;
}
private function getStrippedLink (String $link)
private function getStrippedLink ($link)
{
if(strpos($link, "http") !== 0)
$link = "http://" . $link;
......@@ -210,7 +210,7 @@ class Result
return $host . $path;
}
private function getStrippedDomain (String $link)
private function getStrippedDomain ($link)
{
if(preg_match("/([^\.]*\.[^\.]*)$/si", $link, $match))
{
......@@ -221,7 +221,7 @@ class Result
}
}
private function generateProxyLink (String $link)
private function generateProxyLink ($link)
{
if(!$link)
return "";
......
<?php
namespace App\Models;
/**
 * A single search result delivered by one search engine.
 *
 * The constructor normalises the raw values, rank() computes a relevance
 * score and isValid() applies the blacklist, language, stop-word, per-host
 * and duplicate filters.
 */
class Result
{
    # Title with surrounding whitespace and HTML tags removed.
    public $titel;
    # Trimmed target URL of the result.
    public $link;
    # Trimmed URL as it is displayed; all "stripped" values derive from it.
    public $anzeigeLink;
    # Description without HTML tags, newlines collapsed to single spaces.
    public $descr;
    # Trimmed source attribution passed in by the caller.
    public $gefVon;
    # Link to the result routed through the proxy (see generateProxyLink()).
    public $proxyLink;
    # Inverted position: 20 - position; positions outside 1..20 end up as 0.
    public $sourceRank;
    # Multiplier taken from the provider's "engineBoost" attribute, 1 if absent.
    public $engineBoost;
    # Set to true by the constructor.
    public $valid;
    # Host component of the link as reported by parse_url().
    public $host;
    # Host of the displayed link without a leading "www.".
    public $strippedHost;
    # Last two dot-separated labels of the stripped host.
    public $strippedDomain;
    # Stripped host followed by the path of the displayed link.
    public $strippedLink;
    # Score computed by rank(); 0 until rank() has been called.
    public $rank;
    # Flag passed through from the constructor argument.
    public $partnershop;

    /**
     * Normalises the raw result data and derives the host/domain/link keys.
     *
     * @param \SimpleXMLElement $provider    engine definition; its "engineBoost" attribute is read
     * @param mixed             $titel       raw title
     * @param mixed             $link        raw target URL
     * @param mixed             $anzeigeLink raw display URL
     * @param mixed             $descr       raw description
     * @param mixed             $gefVon      source attribution
     * @param mixed             $sourceRank  position within the engine's result list
     * @param mixed             $partnershop stored unchanged
     */
    function __construct ( \SimpleXMLElement $provider, $titel, $link, $anzeigeLink , $descr, $gefVon, $sourceRank, $partnershop = false )
    {
        $this->titel = strip_tags(trim($titel));
        $this->link = trim($link);
        $this->anzeigeLink = trim($anzeigeLink);
        $this->descr = preg_replace("/\n+/si", " ", strip_tags(trim($descr)));
        $this->gefVon = trim($gefVon);
        $this->proxyLink = $this->generateProxyLink($this->link);

        # Positions outside 1..20 are treated as position 20, then the value
        # is inverted so that an earlier position yields a larger number.
        if ($sourceRank <= 0 || $sourceRank > 20) {
            $sourceRank = 20;
        }
        $this->sourceRank = 20 - $sourceRank;

        $this->engineBoost = isset($provider["engineBoost"]) ? $provider["engineBoost"] : 1;

        $this->valid = true;
        # Parse the trimmed link; surrounding whitespace would confuse parse_url().
        $this->host = @parse_url($this->link, PHP_URL_HOST);
        $this->strippedHost = $this->getStrippedHost($this->anzeigeLink);
        $this->strippedDomain = $this->getStrippedDomain($this->strippedHost);
        # getStrippedLink() reads $this->strippedHost, so it has to run after it was set.
        $this->strippedLink = $this->getStrippedLink($this->anzeigeLink);
        $this->rank = 0;
        $this->partnershop = $partnershop;
    }

    /**
     * Computes the relevance score of this result and stores it in $this->rank.
     *
     * The score is the sum of a position share (sourceRank * 0.02), a URL
     * boost and a search-term boost, multiplied by the engine boost when that
     * boost is greater than zero.
     *
     * @param \App\MetaGer $metager supplies the search query via getQ()
     */
    public function rank (\App\MetaGer $metager)
    {
        $query = $metager->getQ();

        $rank = $this->sourceRank * 0.02;
        $rank += $this->calcUrlBoost($query);
        $rank += $this->calcTermBoost($query);

        if ($this->engineBoost > 0) {
            $rank *= floatval($this->engineBoost);
        }
        $this->rank = $rank;
    }

    /**
     * URL boost: rewards query characters that occur in the displayed link.
     *
     * @param mixed $query the search query
     * @return float|int boost to add to the rank
     */
    private function calcUrlBoost ($query)
    {
        $link = $this->anzeigeLink;
        if (strpos($link, "http") !== 0) {
            $link = "http://" . $link;
        }
        $link = @parse_url($link, PHP_URL_HOST) . @parse_url($link, PHP_URL_PATH);

        # Remove whitespace, scheme, "www." and separators from link and query.
        # preg_replace() applies every pattern of the array in order, so one
        # call is enough (the previous code repeated the identical call once
        # per pattern because it passed the array instead of the loop variable).
        $noise = [
            "/\s+/si",
            "/http:/si",
            "/https:/si",
            "/www\./si",
            "/\//si",
            "/\./si",
            "/-/si"
        ];
        $link = (string) preg_replace($noise, "", $link);
        $query = (string) preg_replace($noise, "", $query);

        # The boost is only granted for results with a description of more
        # than 80 bytes and a non-empty link; an empty query earns nothing.
        if ($query === "" || $link === "" || strlen($this->descr) <= 80) {
            return 0;
        }

        # Previously the search ran on a string that was always empty and the
        # result was tested with ">= 0", which is true even for "not found";
        # every query character was therefore counted regardless of the link.
        $remaining = strtolower($link);
        $count = 0;
        $boost = 0;
        foreach (str_split(strtolower($query)) as $char) {
            if (strpos($remaining, $char) !== false) {
                $count++;
                # Consume the character so that it is counted only once.
                $remaining = str_replace($char, "", $remaining);
            }
            # NOTE(review): the share is added once per query character (as in
            # the original loop), not once per result - confirm this is intended.
            $boost += $count / (strlen($link) * 60);
        }
        return $boost;
    }

    /**
     * Term boost: rewards search words that occur in title and description.
     *
     * Words of up to three characters are ignored. A whole-word match
     * (case-insensitive) earns 70% of the share, a plain substring match
     * (case-sensitive) 30%; the title accounts for 60%, the description for 40%.
     *
     * @param mixed $query the search query
     * @return float|int boost to add to the rank
     */
    private function calcTermBoost ($query)
    {
        $maxRank = 0.1;

        $query = preg_replace("/\b\w{1,3}\b/si", "", $query);
        $query = preg_replace("/\s+/si", " ", $query);
        $terms = explode(" ", trim($query));

        $boost = 0;
        foreach ($terms as $term) {
            # An empty term would match /\b\b/ almost everywhere and is an
            # invalid needle for strpos() on older PHP versions.
            if ($term === "") {
                continue;
            }
            $wholeWord = "/\b" . preg_quote($term, "/") . "\b/si";

            # The substring fallback has to search for the raw term, not for
            # its regex-quoted form.
            if (preg_match($wholeWord, $this->titel)) {
                $boost += .7 * .6 * $maxRank;
            } elseif (strpos($this->titel, $term) !== false) {
                $boost += .3 * .6 * $maxRank;
            }

            if (preg_match($wholeWord, $this->descr)) {
                $boost += .7 * .4 * $maxRank;
            } elseif (strpos($this->descr, $term) !== false) {
                $boost += .3 * .4 * $maxRank;
            }
        }

        # explode() always returns at least one element, so this never divides by zero.
        return $boost / (count($terms) * 10);
    }

    /**
     * @return float|int the score computed by rank(), 0 before rank() was called
     */
    public function getRank ()
    {
        return $this->rank;
    }

    /**
     * Decides whether this result may be shown.
     *
     * On success the result's link and host are registered with $metager
     * (addLink() / addHostCount()), so calling this method has side effects.
     *
     * @param \App\MetaGer $metager provides blacklists, language, stop words and the link/host registry
     * @return bool true if the result passed every filter
     */
    public function isValid (\App\MetaGer $metager)
    {
        # First the personal blacklists (defined via URL parameters):
        if (in_array($this->strippedHost, $metager->getUserHostBlacklist())
            || in_array($this->strippedDomain, $metager->getUserDomainBlacklist())) {
            return false;
        }

        # Now our own domain and URL blacklists:
        if ($this->strippedHost !== ""
            && (in_array($this->strippedHost, $metager->getDomainBlacklist())
                || in_array($this->strippedLink, $metager->getUrlBlacklist()))) {
            return false;
        }

        $text = $this->titel . " " . $this->descr;

        # Language filter: the key of the last detected entry is compared
        # with the requested language.
        if ($metager->getLang() !== "all") {
            $detected = $metager->getLanguageDetect()->detect($text, 1);
            $lang = "";
            foreach ($detected as $key => $value) {
                $lang = $key;
            }
            if ($lang !== "" && $lang !== $metager->getLang()) {
                return false;
            }
        }

        # Stop-word filter: drop results containing any stop word.
        foreach ($metager->getStopWords() as $stopWord) {
            if (stripos($text, $stopWord) !== false) {
                return false;
            }
        }

        # Host filter: at most 3 results per host, unless a site search is
        # active or the host is exempt. strippedHost never starts with "www.",
        # so the exemptions must be listed without that prefix (the former
        # "www.ladenpreis.net" / "www.onenewspage.com" entries could not match).
        $unlimitedHosts = ["ncbi.nlm.nih.gov", "twitter.com", "ladenpreis.net", "onenewspage.com"];
        $hostIsLimited = $metager->getSite() === "";
        foreach ($unlimitedHosts as $unlimitedHost) {
            if (strpos($this->strippedHost, $unlimitedHost) !== false) {
                $hostIsLimited = false;
                break;
            }
        }
        # NOTE(review): the MetaGer diff hunks on this page show getHostCount()
        # indexing addedHosts by the raw host while addHostCount() indexes by
        # md5($host) - confirm that the counter can actually be found.
        if ($hostIsLimited && $metager->getHostCount($this->strippedHost) >= 3) {
            return false;
        }

        # Duplicate filter: addLink() reports whether the link was new.
        if ($metager->addLink($this->strippedLink)) {
            $metager->addHostCount($this->strippedHost);
            return true;
        }
        return false;
    }

    /**
     * Returns the host of $link without a leading "www.".
     *
     * @param mixed $link URL with or without scheme
     * @return string stripped host, "" if parse_url() finds none
     */
    private function getStrippedHost ($link)
    {
        if (strpos($link, "http") !== 0) {
            $link = "http://" . $link;
        }
        $host = @parse_url($link, PHP_URL_HOST);
        # parse_url() yields null/false without a host; cast so preg_replace() gets a string.
        return preg_replace("/^www\./si", "", (string) $host);
    }

    /**
     * Returns the already computed stripped host followed by the path of $link.
     *
     * @param mixed $link URL with or without scheme
     * @return string host and path without scheme, query or fragment
     */
    private function getStrippedLink ($link)
    {
        if (strpos($link, "http") !== 0) {
            $link = "http://" . $link;
        }
        return $this->strippedHost . @parse_url($link, PHP_URL_PATH);
    }

    /**
     * Returns the last two dot-separated labels of $link.
     *
     * @param mixed $link a host name (despite the parameter name)
     * @return mixed "label.label", or $link unchanged if it contains no dot
     */
    private function getStrippedDomain ($link)
    {
        if (preg_match("/([^\.]*\.[^\.]*)$/si", $link, $match)) {
            return $match[1];
        }
        return $link;
    }

    /**
     * Builds the proxy URL for $link.
     *
     * A trailing line break is removed and "scheme://" becomes "scheme/"
     * before the value is appended to the proxy base URL.
     *
     * @param mixed $link target URL
     * @return string proxy URL, "" for an empty (falsy) link
     */
    private function generateProxyLink ($link)
    {
        if (!$link) {
            return "";
        }
        $target = preg_replace("/\r?\n$/s", "", $link);
        $target = preg_replace("#^([\w+.-]+)://#s", "$1/", $target);
        return "https://proxy.suma-ev.de/cgi-bin/nph-proxy.cgi/en/I0/" . $target;
    }
}
\ No newline at end of file
......@@ -73,7 +73,7 @@ abstract class Searchengine
}
public abstract function loadResults(String $result);
public abstract function loadResults($result);
private function writeRequest ()
{
......@@ -168,7 +168,7 @@ abstract class Searchengine
return $fp;
}
private function setStatistic(String $key, float $val)
private function setStatistic($key, float $val)
{
$oldVal = floatval(Redis::hget($this->name, $key)) * $this->uses;
......@@ -177,7 +177,7 @@ abstract class Searchengine
$this->$key = $newVal;
}
public function disable(string $sumaFile, string $message)
public function disable($sumaFile, $message)
{
Log::info($message);
$xml = simplexml_load_file($sumaFile);
......@@ -185,7 +185,7 @@ abstract class Searchengine
$xml->saveXML($sumaFile);
}
public function enable(string $sumaFile, string $message)
public function enable($sumaFile, $message)
{
Log::info($message);
$xml = simplexml_load_file($sumaFile);
......
<?php
namespace App\Models;
use App\MetaGer;
use Log;
use Redis;
abstract class Searchengine
{
protected $ch; # cURL handle for retrieving the results
# Socket stream assigned from getFreeSocket() in the constructor; stays null when none could be opened.
public $fp;
# Request target built by generateGetString(); sent in the request line by writeRequest().
protected $getString = "";
# The \SimpleXMLElement engine definition passed to the constructor.
protected $engine;
# Not referenced in the visible part of the class (the methods use local $counter variables).
protected $counter = 0;
# Index of the socket slot claimed in getFreeSocket().
protected $socketNumber = null;
# Set to false by the constructor while the engine's "disabled" timestamp lies in the future.
public $enabled = true;
# Entries whose rank() method is called by rank().
public $results = [];
# Not referenced in the visible part of the class - presumably advertisement results; confirm.
public $ads = [];
# Running statistics; overwritten by setStatistic("write_time" / "connection_time", ...).
public $write_time = 0;
public $connection_time = 0;
/**
 * Sets the engine up from its XML definition, registers the use in Redis
 * and, if the engine is not disabled, opens a socket and sends the request.
 *
 * @param \SimpleXMLElement $engine  engine definition; every attribute becomes a property of this object
 * @param MetaGer           $metager provides query, URL, language, category, client IP and the suma file path
 */
function __construct(\SimpleXMLElement $engine, MetaGer $metager)
{
    # Copy every attribute of the definition (name, host, port, ...) onto this object.
    foreach ($engine->attributes() as $key => $value) {
        $this->$key = $value->__toString();
    }
    $this->engine = $engine;

    # Register the use of this search engine.
    $this->uses = intval(Redis::hget($this->name, "uses")) + 1;
    Redis::hset($this->name, "uses", $this->uses);

    # A search engine can be disabled temporarily after connection problems:
    if (isset($this->disabled) && strtotime($this->disabled) <= time()) {
        # The timeout of the search engine has expired.
        $this->enable($metager->getSumaFile(), "Die Suchmaschine " . $this->name . " wurde wieder eingeschaltet.");
    } elseif (isset($this->disabled) && strtotime($this->disabled) > time()) {
        $this->enabled = false;
        return;
    }

    # Define the user agent: forward the client's, otherwise use a fixed fallback.
    if (isset($_SERVER['HTTP_USER_AGENT'])) {
        $this->useragent = $_SERVER['HTTP_USER_AGENT'];
    } else {
        $this->useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1";
    }

    $this->ip = $metager->getIp();
    $this->gefVon = "<a href=\"" . $this->homepage . "\" target=\"_blank\">" . $this->displayName . "</a>";
    $this->startTime = microtime();
    $this->getString = $this->generateGetString($metager->getQ(), $metager->getUrl(), $metager->getLanguage(), $metager->getCategory());

    # We need an available socket to communicate over.
    # NOTE(review): microtime(true) already returns seconds, so dividing the
    # difference by 1000000 does not yield a common time unit. Left unchanged
    # because the stored running averages depend on it - confirm the intended unit.
    $time = microtime(true);
    $this->fp = $this->getFreeSocket();
    $this->setStatistic("connection_time", ((microtime(true)-$time) / 1000000));

    if (!$this->fp) {
        $this->disable($metager->getSumaFile(), "Die Suchmaschine " . $this->name . " wurde für 1h deaktiviert, weil keine Verbindung aufgebaut werden konnte");
        # Without a socket this instance cannot deliver results either, so mark
        # it like an engine that was found disabled above.
        $this->enabled = false;
    } else {
        $time = microtime(true);
        $this->writeRequest();
        $this->setStatistic("write_time", ((microtime(true)-$time) / 1000000));
    }
}
/**
 * Parses the raw response of the search engine; implemented by each concrete engine.
 *
 * The parameter is intentionally untyped: the merge this file comes from
 * removed the former "String" hint.
 *
 * @param mixed $result raw response
 */
public abstract function loadResults($result);
/**
 * Writes the HTTP GET request for $this->getString to the socket.
 *
 * If writing raises an \ErrorException the socket is closed, its lock is
 * released, a new socket is requested and the request is sent again from the
 * start. Aborts with HTTP 500 when nothing can be written.
 */
private function writeRequest ()
{
    $out = "GET " . $this->getString . " HTTP/1.1\r\n";
    $out .= "Host: " . $this->host . "\r\n";
    $out .= "User-Agent: " . $this->useragent . "\r\n";
    $out .= "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n";
    $out .= "Accept-Language: de,en-US;q=0.7,en;q=0.3\r\n";
    $out .= "Accept-Encoding: gzip, deflate, br\r\n";
    $out .= "Connection: keep-alive\r\n\r\n";

    # Send the request; fwrite() may write only a part, so loop until everything is out.
    $sent = 0;
    $string = $out;
    $time = microtime(true);
    while (true) {
        try {
            $tmp = fwrite($this->fp, $string);
        } catch (\ErrorException $e) {
            # Something happened to our socket. We need a new one.
            fclose($this->fp);
            # getFreeSocket() stores the lock under "<host>.<slot>", so that is
            # the key to release (the former "<name>.<slot>" key is never set
            # in the visible code).
            Redis::del($this->host . "." . $this->socketNumber);
            $this->fp = $this->getFreeSocket();
            if (!$this->fp) {
                # No replacement socket: give up instead of writing to an invalid handle.
                abort(500, "Konnte die Request Daten nicht an: " . $this->name . " senden");
            }
            $sent = 0;
            $string = $out;
            continue;
        }

        if ($tmp) {
            $sent += $tmp;
            $string = substr($string, $tmp);
        } else {
            abort(500, "Fehler beim schreiben.");
        }

        # NOTE(review): microtime(true) returns seconds; divided by 1000000 the
        # value practically never reaches 500, so this timeout cannot fire.
        # Left unchanged because the intended limit is not evident - confirm.
        if (((microtime(true) - $time) / 1000000) >= 500) {
            abort(500, "Konnte die Request Daten nicht an: " . $this->name . " senden");
        }

        if ($sent >= strlen($out)) {
            break;
        }
    }
}
/**
 * Lets every collected result compute its rank.
 *
 * @param \App\MetaGer $metager handed through to each result's rank() method
 */
public function rank (\App\MetaGer $metager)
{
    $entries = $this->results;
    foreach ($entries as $entry) {
        $entry->rank($metager);
    }
}
# Claims the first free socket slot for $this->host in Redis and opens the
# persistent socket belonging to it.
# Returns the stream from pfsockopen(), or null if opening raised an \ErrorException.
private function getFreeSocket()
{
# Depending on the server load (simultaneous requests) we may need several sockets to answer the requests without waiting.
# pfsockopen opens a persistent socket, which can therefore also be shared between the different PHP processes.
# If the host name matches an already created socket, the connection is picked up and continued.
# However, we may only use it if it is not already being used for communication by another process.
# If it is, we try the next socket.
# Determining this is more complicated than one might think. The following scheme should work:
# 1. Determine whether this socket was newly created or whether an existing one was opened.
$counter = 0; $fp = null;
do
{
# A slot counts as free when no Redis key "<host>.<counter>" exists.
# NOTE(review): exists + set are two separate commands, so two processes could
# claim the same slot at the same time - consider an atomic SETNX.
if( intval(Redis::exists($this->host . ".$counter")) === 0 )
{
Redis::set($this->host . ".$counter", 1);
# The claim expires on its own after 5 seconds.
Redis::expire($this->host . ".$counter", 5);
$this->socketNumber = $counter;
try
{
# The "/$counter" suffix presumably gives every slot its own persistent connection - confirm.
# NOTE(review): pfsockopen() fills its 3rd argument with the error code and its
# 4th with the message, so the two variable names are swapped (neither is read here).
$fp = pfsockopen($this->getHost() . ":" . $this->port . "/$counter", $this->port, $errstr, $errno, 1);
}catch(\ErrorException $e)
{
# Opening failed: leave the loop, $fp is still null.
break;
}
# Check whether the read buffer is empty:
stream_set_blocking($fp, 0);
# NOTE(review): BUFFER_LENGTH is not defined in the visible part of the file.
if(fgets($fp, BUFFER_LENGTH) !== false)
{
Log::error("Der Lesepuffer von: " . $this->name . " war nach dem Erstellen nicht leer. Musste den Socket neu starten.");
fclose($fp);
# NOTE(review): the reopened socket is not switched to non-blocking mode again
# and this call is not guarded by the try/catch above.
$fp = pfsockopen($this->getHost() . ":" . $this->port . "/$counter", $this->port, $errstr, $errno, 1);
}
# Emits a response header named after the engine, containing the slot number and host.
header($this->name . ": " . $counter . "_" . $this->getHost());
break;
}
$counter++;
}while(true);
return $fp;
}
/**
 * Updates the running average of a statistic in Redis and mirrors it on this object.
 *
 * $key is intentionally untyped: the merge this file comes from removed the
 * former "String" hint.
 *
 * @param mixed $key name of the statistic (hash field and property name)
 * @param float $val new sample; negative values are counted as 0
 */
private function setStatistic($key, float $val)
{
    # $this->uses already includes the current use (the constructor increments
    # it first), so the stored mean covers uses - 1 samples. Multiplying it by
    # $this->uses - as before - made the stored value grow with every call.
    # NOTE(review): assumes one sample was recorded per counted use - confirm.
    $samples = max($this->uses, 1);
    $oldSum = floatval(Redis::hget($this->name, $key)) * ($samples - 1);
    $newVal = ($oldSum + max($val, 0)) / $samples;
    Redis::hset($this->name, $key, $newVal);
    $this->$key = $newVal;
}
/**
 * Logs $message and marks this engine as disabled in the suma file by
 * setting its "disabled" attribute to a timestamp one hour from now.
 *
 * Parameters are intentionally untyped: the merge this file comes from
 * removed the former "string" hints.
 *
 * @param mixed $sumaFile path of the XML file holding the engine definitions
 * @param mixed $message  text written to the log
 */
public function disable($sumaFile, $message)
{
    Log::info($message);
    $xml = simplexml_load_file($sumaFile);
    if ($xml === false) {
        # The file could not be read or parsed; nothing can be updated.
        Log::error("Could not load suma file: " . $sumaFile);
        return;
    }
    $nodes = $xml->xpath("//sumas/suma[@name='" . $this->name . "']");
    if (empty($nodes)) {
        # No definition with this name; do not rewrite the file unchanged.
        Log::error("No suma entry named " . $this->name . " in " . $sumaFile);
        return;
    }
    # Current local time with the hour increased by one.
    $nodes[0]['disabled'] = date(DATE_RFC822, mktime(date("H")+1,date("i"), date("s"), date("m"), date("d"), date("Y")));
    $xml->saveXML($sumaFile);
}
/**
 * Logs $message and removes the "disabled" attribute of this engine from the suma file.
 *
 * Parameters are intentionally untyped: the merge this file comes from
 * removed the former "string" hints.
 *
 * @param mixed $sumaFile path of the XML file holding the engine definitions
 * @param mixed $message  text written to the log
 */
public function enable($sumaFile, $message)
{
    Log::info($message);
    $xml = simplexml_load_file($sumaFile);
    if ($xml === false) {
        # The file could not be read or parsed; nothing can be updated.
        Log::error("Could not load suma file: " . $sumaFile);
        return;
    }
    $nodes = $xml->xpath("//sumas/suma[@name='" . $this->name . "']");
    if (empty($nodes)) {
        # No definition with this name; do not rewrite the file unchanged.
        Log::error("No suma entry named " . $this->name . " in " . $sumaFile);
        return;
    }
    unset($nodes[0]['disabled']);
    $xml->saveXML($sumaFile);
}
/**
 * Closes the socket of this engine, if one is open.
 */
public function closeFp()
{
    # $this->fp stays null/false when no socket could be obtained (see the
    # constructor); fclose() must only be called on a real stream.
    if (is_resource($this->fp)) {
        fclose($this->fp);
    }
}
public function retrieveResults()
{
$time = microtime(true);
$headers = '';
$body = '';