Searchengine.php 13.8 KB
Newer Older
1 2
<?php

3
namespace App\Models;
4
use App\MetaGer;
5 6
use Log;
use Redis;
7

8
abstract class Searchengine
9 10
{

11
	/** @var mixed Curl handle for receiving the results (never assigned in the visible part of this class). */
	protected $ch;

	/** @var resource|null|false Socket stream to the search engine, as returned by getFreeSocket(). */
	public $fp;

	/** @var string Request target (path plus query string) produced by generateGetString(). */
	protected $getString = "";

	/** @var \SimpleXMLElement XML definition this engine was built from. */
	protected $engine;

	/** @var int Not read or written by the visible code of this class. */
	protected $counter = 0;

	/** @var int|null Index of the socket slot claimed in getFreeSocket(). */
	protected $socketNumber = null;

	/** @var bool Set to false by the constructor while the engine's "disabled" timestamp lies in the future. */
	public $enabled = true;

	/** @var array Result objects; rank() calls ->rank() on each of them. */
	public $results = [];

	/** @var array Not filled by the visible code of this class. */
	public $ads = [];

	/** @var float|int Statistic written through setStatistic("write_time", ...). */
	public $write_time = 0;

	/** @var float|int Statistic written through setStatistic("connection_time", ...). */
	public $connection_time = 0;

	/** @var bool Becomes true once retrieveResults() has read a response body. */
	public $loaded = false;
23

24
	/**
	 * Builds the engine from its XML definition and immediately sends the search request.
	 *
	 * Steps: copy all XML attributes onto this object, count the usage in Redis,
	 * honour or lift a temporary "disabled" timeout, build the request string,
	 * claim a socket and write the HTTP request to it.
	 *
	 * @param \SimpleXMLElement $engine  Engine definition; every attribute becomes a property of this object.
	 * @param MetaGer           $metager Supplies query, URL, language, category, client IP and the suma file path.
	 */
	function __construct(\SimpleXMLElement $engine, MetaGer $metager)
	{
		foreach ($engine->attributes() as $key => $value) {
			$this->$key = $value->__toString();
		}

		if (!isset($this->homepage)) {
			$this->homepage = "https://metager.de";
		}
		$this->engine = $engine;

		# Register the usage of this search engine. HINCRBY increments atomically,
		# so concurrent requests cannot overwrite each other's count.
		$this->uses = intval(Redis::hincrby($this->name, "uses", 1));

		# An engine can be disabled temporarily after connection problems:
		if (isset($this->disabled)) {
			if (strtotime($this->disabled) <= time()) {
				# The timeout has expired, switch the engine back on.
				$this->enable($metager->getSumaFile(), "Die Suchmaschine " . $this->name . " wurde wieder eingeschaltet.");
			} else {
				$this->enabled = false;
				return;
			}
		}

		# Define the User-Agent: forward the client's, or fall back to a fixed one.
		if (isset($_SERVER['HTTP_USER_AGENT'])) {
			$this->useragent = $_SERVER['HTTP_USER_AGENT'];
		} else {
			$this->useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1";
		}

		$this->ip = $metager->getIp();
		$this->gefVon = "<a href=\"" . $this->homepage . "\" target=\"_blank\">" . $this->displayName . "</a>";
		$this->startTime = microtime();

		$this->getString = $this->generateGetString($metager->getQ(), $metager->getUrl(), $metager->getLanguage(), $metager->getCategory());

		# We need an available socket to communicate over.
		# NOTE(review): microtime(true) already yields seconds; the extra division by
		# 1000000 is kept so stored statistics stay comparable - confirm the intended unit.
		$time = microtime(true);
		$this->fp = $this->getFreeSocket();
		$this->setStatistic("connection_time", ((microtime(true) - $time) / 1000000));

		if (!$this->fp) {
			$this->disable($metager->getSumaFile(), "Die Suchmaschine " . $this->name . " wurde für 1h deaktiviert, weil keine Verbindung aufgebaut werden konnte");
		} else {
			$time = microtime(true);
			$this->writeRequest();
			$this->setStatistic("write_time", ((microtime(true) - $time) / 1000000));
		}
	}

79
	/**
	 * Engine-specific parsing hook.
	 *
	 * retrieveResults() calls this with the response body it read from the socket
	 * (already gunzipped when the response was gzip encoded).
	 *
	 * @param string $result Response body.
	 */
	abstract public function loadResults($result);
80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97

	/**
	 * Writes the HTTP GET request for $this->getString to the engine's socket.
	 *
	 * If a write raises an ErrorException the socket is treated as broken: it is
	 * closed, its Redis lock is released, a fresh socket is claimed and the whole
	 * request is sent again. The number of such reconnects is bounded and a failed
	 * reconnect aborts instead of looping on an empty handle.
	 */
	private function writeRequest ()
	{
		$out = "GET " . $this->getString . " HTTP/1.1\r\n";
		$out .= "Host: " . $this->host . "\r\n";
		$out .= "User-Agent: " . $this->useragent . "\r\n";
		$out .= "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n";
		$out .= "Accept-Language: de,en-US;q=0.7,en;q=0.3\r\n";
		# Only gzip is advertised: retrieveResults() decodes nothing but gzip bodies.
		$out .= "Accept-Encoding: gzip\r\n";
		$out .= "Connection: keep-alive\r\n\r\n";

		# Send the request:
		$total = strlen($out);
		$sent = 0;
		$pending = $out;
		$reconnects = 0;
		$maxReconnects = 3;
		$time = microtime(true);

		while ($sent < $total) {
			try {
				$written = fwrite($this->fp, $pending);
			} catch (\ErrorException $e) {
				# Something happened to our socket. We need a new one:
				if (is_resource($this->fp)) {
					fclose($this->fp);
				}
				# The lock is keyed by host (see getFreeSocket), not by engine name.
				Redis::del($this->host . "." . $this->socketNumber);
				$this->fp = $this->getFreeSocket();
				$reconnects++;
				if (!$this->fp || $reconnects > $maxReconnects) {
					abort(500, "Konnte die Request Daten nicht an: " . $this->name . " senden");
				}
				$sent = 0;
				$pending = $out;
				continue;
			}

			if (!$written) {
				abort(500, "Fehler beim schreiben.");
			}
			$sent += $written;
			$pending = substr($pending, $written);

			# NOTE(review): microtime(true) is in seconds, so with this divisor the
			# limit is practically unreachable - confirm the intended timeout.
			if (((microtime(true) - $time) / 1000000) >= 500) {
				abort(500, "Konnte die Request Daten nicht an: " . $this->name . " senden");
			}
		}
	}

123 124 125 126 127 128 129
	/**
	 * Asks every entry of $this->results to rank itself.
	 *
	 * @param \App\MetaGer $metager Passed through unchanged to each result's rank().
	 */
	public function rank (\App\MetaGer $metager)
	{
		foreach ($this->results as $entry) {
			$entry->rank($metager);
		}
	}
130 131 132 133 134 135 136 137 138 139

	/**
	 * Claims a free socket slot for this engine's host and opens a persistent socket on it.
	 *
	 * Depending on server load (concurrent queries) several sockets may be needed to
	 * answer requests without waiting. pfsockopen() opens a persistent socket that can
	 * be picked up again by other PHP processes when the hostname matches, so a socket
	 * may only be used while no other process is talking over it. Slots are numbered
	 * 0, 1, 2, ... and guarded by the Redis key "<host>.<slot>"; the first slot that can
	 * be claimed is used.
	 *
	 * @return resource|null The socket stream, or null when no connection could be opened.
	 */
	private function getFreeSocket()
	{
		$counter = 0;

		# Find the first slot nobody else holds. SETNX claims it atomically, so two
		# processes can never both believe they own the same slot.
		while (!Redis::setnx($this->host . ".$counter", 1)) {
			$counter++;
		}

		$lockKey = $this->host . ".$counter";
		Redis::expire($lockKey, 5);
		$this->socketNumber = $counter;
		$address = $this->getHost() . ":" . $this->port . "/$counter";

		try {
			$fp = pfsockopen($address, $this->port, $errno, $errstr, 1);
		} catch (\ErrorException $e) {
			$fp = null;
		}
		if (!$fp) {
			# No connection: give the slot back instead of blocking it until the key expires.
			Redis::del($lockKey);
			$this->socketNumber = null;
			return null;
		}

		# Check whether the read buffer is empty; leftovers mean a previous response
		# was not consumed completely, so the socket is reopened.
		stream_set_blocking($fp, 0);
		if (fgets($fp, BUFFER_LENGTH) !== false) {
			Log::error("Der Lesepuffer von: " . $this->name . " war nach dem Erstellen nicht leer. Musste den Socket neu starten.");
			fclose($fp);
			try {
				$fp = pfsockopen($address, $this->port, $errno, $errstr, 1);
			} catch (\ErrorException $e) {
				$fp = null;
			}
			if (!$fp) {
				Redis::del($lockKey);
				$this->socketNumber = null;
				return null;
			}
		}

		return $fp;
	}

173
	/**
	 * Folds a new measurement into the running average stored in Redis for this engine.
	 *
	 * The stored value is treated as the mean over the previous ($this->uses - 1)
	 * uses, so the new mean is (oldMean * (uses - 1) + value) / uses. Negative
	 * measurements are clamped to 0. The result is also mirrored onto $this->$key.
	 *
	 * @param string    $key Name of the hash field, e.g. "connection_time".
	 * @param float|int $val New measurement.
	 */
	private function setStatistic($key, $val)
	{
		# Never divide by zero, even if the usage counter is missing.
		$uses = max(1, intval($this->uses));

		$previousSum = floatval(Redis::hget($this->name, $key)) * ($uses - 1);
		$newVal = ($previousSum + max($val, 0)) / $uses;

		Redis::hset($this->name, $key, $newVal);
		$this->$key = $newVal;
	}

182
	/**
	 * Marks this engine as disabled for one hour in the suma XML file.
	 *
	 * Writes a "disabled" attribute holding the RFC 822 timestamp of now + 1h onto
	 * the <suma> element whose name matches this engine and saves the file.
	 *
	 * @param string $sumaFile Path of the suma XML file.
	 * @param string $message  Text written to the info log.
	 */
	public function disable($sumaFile, $message)
	{
		Log::info($message);

		$xml = simplexml_load_file($sumaFile);
		if ($xml === false) {
			Log::error("Die Suma-Datei " . $sumaFile . " konnte nicht geladen werden.");
			return;
		}

		$sumas = $xml->xpath("//sumas/suma[@name='" . $this->name . "']");
		if (empty($sumas)) {
			# No matching engine entry, nothing to change.
			return;
		}

		# Exactly one hour from now, independent of local-time jumps.
		$sumas[0]['disabled'] = date(DATE_RFC822, time() + 3600);
		$xml->saveXML($sumaFile);
	}

190
	/**
	 * Removes the "disabled" attribute of this engine from the suma XML file.
	 *
	 * @param string $sumaFile Path of the suma XML file.
	 * @param string $message  Text written to the info log.
	 */
	public function enable($sumaFile, $message)
	{
		Log::info($message);

		$xml = simplexml_load_file($sumaFile);
		if ($xml === false) {
			Log::error("Die Suma-Datei " . $sumaFile . " konnte nicht geladen werden.");
			return;
		}

		$sumas = $xml->xpath("//sumas/suma[@name='" . $this->name . "']");
		if (empty($sumas)) {
			# No matching engine entry, nothing to change.
			return;
		}

		unset($sumas[0]['disabled']);
		$xml->saveXML($sumaFile);
	}

	/**
	 * Closes the engine's socket if one is open.
	 *
	 * The handle is cleared afterwards: a closed resource is still truthy, so
	 * leaving it in place would make a later shutdown() try to close it again.
	 */
	public function closeFp()
	{
		if (is_resource($this->fp)) {
			fclose($this->fp);
		}
		$this->fp = null;
	}

	/**
	 * Reads the HTTP response from the socket and hands the body to loadResults().
	 *
	 * Headers are read without blocking; if fewer than two header lines are
	 * available the method returns without setting $this->loaded. Otherwise the
	 * body is read (chunked or by Content-Length), the socket lock in Redis is
	 * released, the read time is recorded and a gzip body is decompressed.
	 */
	public function retrieveResults()
	{
		$time = microtime(true);
		$headers = [];
		$body = '';

		if (!$this->fp) {
			return;
		}

		# Headers first. Names are stored lower-cased because HTTP header names
		# are case-insensitive.
		stream_set_blocking($this->fp, 0);
		while (true) {
			# fgets stops at the first newline or when the buffer is full.
			$data = fgets($this->fp, BUFFER_LENGTH);

			# A single CRLF marks the end of the headers.
			# NOTE(review): microtime(true) is in seconds, so with this divisor the
			# limit is practically unreachable - confirm the intended timeout.
			if ($data === false || $data == CRLF || feof($this->fp) || ((microtime(true) - $time) / 1000000) > 100) {
				break;
			}

			# Split on the first colon only, so values containing ": " are kept.
			$parts = explode(":", $data, 2);
			if (count($parts) === 2) {
				$headers[strtolower(trim($parts[0]))] = trim($parts[1]);
			}
		}

		if (count($headers) <= 1) {
			return;
		}

		stream_set_blocking($this->fp, 1);
		if (isset($headers["transfer-encoding"]) && strtolower($headers["transfer-encoding"]) === "chunked") {
			$body = $this->readChunked();
		} elseif (isset($headers["content-length"])) {
			$length = $headers["content-length"];
			if (is_numeric($length) && $length >= 1) {
				$body = $this->readBody((int) $length);
			}
		} else {
			die("Konnte nicht herausfinden, wie ich die Serverantwort von: " . $this->name . " auslesen soll. Header war: " . print_r($headers, true));
		}
		$this->loaded = true;

		# The socket is no longer in use by this request.
		Redis::del($this->host . "." . $this->socketNumber);
		$this->setStatistic("read_time", ((microtime(true) - $time) / 1000000));

		if (isset($headers["content-encoding"]) && strtolower($headers["content-encoding"]) === "gzip") {
			$body = $this->gunzip($body);
		}

		$this->loadResults($body);
	}

272 273
	/**
	 * Closes the socket (if still open) and releases its slot lock in Redis.
	 *
	 * is_resource() is used instead of a truthiness check because a handle that
	 * was already closed, e.g. by closeFp(), is still truthy.
	 */
	public function shutdown()
	{
		if (is_resource($this->fp)) {
			fclose($this->fp);
		}
		$this->fp = null;
		Redis::del($this->host . "." . $this->socketNumber);
	}

279
	/**
	 * Reads a response body of known length from the socket.
	 *
	 * The stream is switched to non-blocking mode and polled until $length bytes
	 * have arrived, the peer closed the connection, or roughly one second has passed.
	 * Only the byte count ends the body; its content is never interpreted.
	 *
	 * @param int|string $length Number of bytes announced by Content-Length.
	 * @return string The bytes read so far (may be shorter than $length on timeout/EOF).
	 */
	private function readBody($length)
	{
		$theData = '';
		$remaining = (int) $length;

		stream_set_blocking($this->fp, 0);
		# Sub-second clock: with time() the one second budget could shrink to almost nothing.
		$deadline = microtime(true) + 1.0;

		while ($remaining > 0 && !feof($this->fp) && microtime(true) < $deadline) {
			$piece = fread($this->fp, min($remaining, BUFFER_LENGTH));
			if ($piece === false || $piece === '') {
				# Nothing available yet, wait briefly before polling again.
				usleep(100);
				continue;
			}
			$theData .= $piece;
			$remaining -= strlen($piece);
		}

		return $theData;
	}

298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401
	/**
	 * Reads a body sent with "Transfer-Encoding: chunked" from the socket.
	 *
	 * Each chunk starts with a line holding its size in hex, followed by that many
	 * bytes and a line break; a size of zero ends the body.
	 *
	 * @return string The concatenated chunk data.
	 */
	private function readChunked()
	{
		$body = '';

		while (true) {
			# The line carrying the size of the next chunk.
			$line = fgets($this->fp, BUFFER_LENGTH);
			if ($line === false) {
				# Stream ended or could not be read.
				break;
			}

			# A bare line break is only the separator behind the previous chunk.
			if ($line == CRLF) {
				continue;
			}

			# Drop an optional chunk extension (";...") and everything that is not a
			# hex digit before converting, so hexdec() sees only valid characters.
			$sizeField = explode(';', $line, 2)[0];
			$length = hexdec(preg_replace('/[^0-9a-fA-F]/', '', $sizeField));

			if (!is_int($length)) {
				trigger_error('Most likely not chunked encoding', E_USER_ERROR);
			}

			# Zero marks the last chunk; also stop at the end of the stream.
			if ($length < 1 || feof($this->fp)) {
				if ($length <= 0) {
					# Consume the line break that follows the terminating chunk.
					fgets($this->fp, BUFFER_LENGTH);
				}
				break;
			}

			# Read until the whole chunk has arrived.
			while ($length > 0) {
				$data = fread($this->fp, $length);
				if ($data === false || $data === '') {
					# EOF or read timeout: give up instead of spinning forever.
					break 2;
				}
				$body .= $data;
				$length -= strlen($data);
			}

			# Consume the line break that terminates the chunk data.
			fgets($this->fp, BUFFER_LENGTH);
		}

		return $body;
	}

	/**
	 * Decompresses a gzip encoded response body.
	 *
	 * gzdecode() parses the complete gzip header, including optional fields, which
	 * a fixed-size header skip would misread as compressed data.
	 *
	 * @param string $zipped Raw gzip data.
	 * @return string The decompressed data, or "Unknown Format" when $zipped does not
	 *                start with the gzip magic bytes and the deflate method byte.
	 */
	private function gunzip($zipped)
	{
		if (substr($zipped, 0, 3) !== "\x1f\x8b\x08") {
			return "Unknown Format";
		}

		try {
			$unzipped = gzdecode($zipped);
		} catch (\Exception $e) {
			# Raised when warnings are converted to exceptions.
			$unzipped = false;
		}

		if ($unzipped === false) {
			abort(500, "Fehler beim unzip des Ergebnisses von folgendem Anbieter: " . $this->name);
		}

		return $unzipped;
	}

	/**
	 * Returns the engine's host prefixed with the transport to use.
	 *
	 * @return string "tls://<host>" when the port is the string "443", otherwise "tcp://<host>".
	 */
	protected function getHost()
	{
		$transport = ($this->port === "443") ? "tls://" : "tcp://";

		return $transport . $this->host;
	}
402

403 404 405 406 407
	/**
	 * Returns curl_getinfo() for the handle stored in $this->ch.
	 *
	 * NOTE(review): nothing in the visible part of this class assigns $this->ch -
	 * presumably a subclass or caller does; verify before relying on this.
	 *
	 * @return mixed Whatever curl_getinfo() returns for the handle.
	 */
	public function getCurlInfo()
	{
		$handle = $this->ch;

		return curl_getinfo($handle);
	}

408 409 410 411 412
	/**
	 * Returns the curl error number for the handle stored in $this->ch.
	 *
	 * @return int Result of curl_errno() for the handle.
	 */
	public function getCurlErrors()
	{
		$handle = $this->ch;

		return curl_errno($handle);
	}

413 414 415
	/**
	 * Registers the handle stored in $this->ch with a curl multi handle.
	 *
	 * @param mixed $mh The curl multi handle to add to.
	 */
	public function addCurlHandle ($mh)
	{
		$handle = $this->ch;
		curl_multi_add_handle($mh, $handle);
	}

418 419 420 421 422
	/**
	 * Detaches the handle stored in $this->ch from a curl multi handle.
	 *
	 * @param mixed $mh The curl multi handle to remove from.
	 */
	public function removeCurlHandle ($mh)
	{
		$handle = $this->ch;
		curl_multi_remove_handle($mh, $handle);
	}

423
	/**
	 * Builds the request target (path plus query string) for this engine.
	 *
	 * Starts from the engine's "skript" attribute (or "/" when it is empty or
	 * missing), appends "?" and the "formData" attribute when present, and then
	 * substitutes the <<...>> placeholders with URL-encoded values.
	 *
	 * @param string $query    Search query, replaces <<QUERY>>.
	 * @param string $url      Passed to getOvertureAffilData() for <<AFFILDATA>>.
	 * @param string $language Replaces <<LANGUAGE>>.
	 * @param string $category Replaces <<CATEGORY>>.
	 * @return string The finished request target.
	 */
	private function generateGetString($query, $url, $language, $category)
	{
		# Script path:
		if (isset($this->skript) && strlen($this->skript) > 0) {
			$getString = $this->skript;
		} else {
			$getString = "/";
		}

		# Form data:
		if (isset($this->formData) && strlen($this->formData) > 0) {
			$getString .= "?" . $this->formData;
		}

		# Placeholders still have to be replaced. Values are resolved lazily, so
		# nothing is encoded for placeholders that do not occur in the string.
		$placeholders = [
			"<<USERAGENT>>" => function () {
				return $this->urlEncode($this->useragent);
			},
			"<<QUERY>>" => function () use ($query) {
				return $this->urlEncode($query);
			},
			"<<IP>>" => function () {
				return $this->urlEncode($this->ip);
			},
			"<<LANGUAGE>>" => function () use ($language) {
				return $this->urlEncode($language);
			},
			"<<CATEGORY>>" => function () use ($category) {
				return $this->urlEncode($category);
			},
			"<<AFFILDATA>>" => function () use ($url) {
				return $this->getOvertureAffilData($url);
			},
		];

		foreach ($placeholders as $placeholder => $resolve) {
			# Compare against false explicitly: a match at offset 0 is falsy.
			if (strpos($getString, $placeholder) !== false) {
				$getString = str_replace($placeholder, $resolve(), $getString);
			}
		}

		return $getString;
	}

468
	/**
	 * URL-encodes a string, converting it to the engine's input encoding first
	 * when the "inputEncoding" property is set.
	 *
	 * @param string $string Value to encode.
	 * @return string The URL-encoded value.
	 */
	protected function urlEncode($string)
	{
		$prepared = isset($this->inputEncoding)
			? mb_convert_encoding($string, $this->inputEncoding)
			: $string;

		return urlencode($prepared);
	}

479
	/**
	 * Builds the "&affilData=...&serveUrl=..." fragment that replaces <<AFFILDATA>>.
	 *
	 * affilData is the URL-encoded string "ip=<ip>&ua=<useragent>", extended by
	 * "&xfip=<X-Forwarded-For>" when that request header is present.
	 *
	 * @param string $url The serve URL, URL-encoded into the serveUrl parameter.
	 * @return string The query string fragment, starting with "&".
	 */
	private function getOvertureAffilData($url)
	{
		$pairs = [
			'ip=' . $this->ip,
			'ua=' . $this->useragent,
		];
		if (isset($_SERVER['HTTP_X_FORWARDED_FOR'])) {
			$pairs[] = 'xfip=' . $_SERVER['HTTP_X_FORWARDED_FOR'];
		}

		$encodedAffilData = $this->urlEncode(implode('&', $pairs));
		# The serve URL is needed as well:
		$encodedServeUrl = $this->urlEncode($url);

		return "&affilData=" . $encodedAffilData . "&serveUrl=" . $encodedServeUrl;
	}
492 493 494 495 496

	/**
	 * Reports the value of the $enabled flag.
	 *
	 * @return bool False when the constructor found the engine's "disabled" timeout still running.
	 */
	public function isEnabled ()
	{
		$state = $this->enabled;

		return $state;
	}
497
}