Searchengine.php 13.8 KB
Newer Older
1
2
<?php

3
namespace App\Models;
4
use App\MetaGer;
5
6
use Log;
use Redis;
7

8
/**
 * Base class for one remote search engine queried over a raw, persistent socket.
 *
 * The constructor copies every attribute of the engine's XML element onto the
 * object, records the usage in Redis, reserves a socket slot and immediately
 * writes the HTTP request. The response is read later through retrieveResults()
 * and handed to the subclass via loadResults().
 *
 * Properties such as name, host, port, skript, formData, displayName, homepage,
 * disabled and inputEncoding are NOT declared here; they are created at runtime
 * from the XML attributes. The attribute below keeps that working without
 * deprecation notices on PHP >= 8.2 (older PHP versions parse the line as a comment).
 *
 * The global constants BUFFER_LENGTH and CRLF and the helper abort() are used
 * but not defined in this file.
 */
#[\AllowDynamicProperties]
abstract class Searchengine
{
    /** Seconds a reserved socket slot lives in Redis before it expires on its own. */
    const SOCKET_LOCK_TTL = 5;

    /** How often writeRequest() may replace a broken socket before it gives up. */
    const MAX_WRITE_RECONNECTS = 3;

    # cURL handle used by the *Curl* accessors below; never assigned inside this class.
    protected $ch;
    # Socket stream returned by getFreeSocket(), or null if no connection could be opened.
    public $fp;
    # Request target (path + query string) built by generateGetString().
    protected $getString = "";
    # The XML element this engine was configured from.
    protected $engine;
    # Not referenced by any method of this class.
    protected $counter = 0;
    # Index of the socket slot currently reserved in Redis, or null if none is held.
    protected $socketNumber = null;
    public $enabled = true;
    public $results = [];
    public $ads = [];
    public $write_time = 0;
    public $connection_time = 0;
    # Set to true once retrieveResults() has read a complete response body.
    public $loaded = false;

    /**
     * Configures the engine from its XML element and sends the search request.
     *
     * If the engine carries a "disabled" attribute that lies in the future, the
     * object is marked as not enabled and nothing else happens.
     *
     * @param \SimpleXMLElement $engine  <suma> element describing this engine
     * @param MetaGer           $metager provides query, IP, language, category, URL and the suma file path
     */
    public function __construct(\SimpleXMLElement $engine, MetaGer $metager)
    {
        foreach ($engine->attributes() as $key => $value) {
            $this->$key = $value->__toString();
        }
        if (!isset($this->homepage)) {
            $this->homepage = "https://metager.de";
        }
        $this->engine = $engine;

        # Register the usage of this engine. HINCRBY increments atomically, so
        # concurrent requests cannot lose an update the way HGET + HSET could.
        $this->uses = intval(Redis::hincrby($this->name, "uses", 1));

        # An engine can be switched off temporarily after connection problems.
        if (isset($this->disabled)) {
            if (strtotime($this->disabled) > time()) {
                $this->enabled = false;
                return;
            }
            # The timeout has elapsed (or the timestamp is unparsable): switch it back on.
            $this->enable($metager->getSumaFile(), "Die Suchmaschine " . $this->name . " wurde wieder eingeschaltet.");
        }

        # Forward the client's user agent, falling back to a fixed one.
        if (isset($_SERVER['HTTP_USER_AGENT'])) {
            $this->useragent = $_SERVER['HTTP_USER_AGENT'];
        } else {
            $this->useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1";
        }
        $this->ip = $metager->getIp();
        $this->gefVon = "<a href=\"" . $this->homepage . "\" target=\"_blank\">" . $this->displayName . "</a>";
        # Kept in microtime()'s string form ("msec sec") as before; consumers are outside this file.
        $this->startTime = microtime();

        $this->getString = $this->generateGetString($metager->getQ(), $metager->getUrl(), $metager->getLanguage(), $metager->getCategory());

        # We need a free socket to talk to the engine.
        $time = microtime(true);
        $this->fp = $this->getFreeSocket();
        $this->setStatistic("connection_time", $this->elapsedSince($time));

        if (!$this->fp) {
            $this->disable($metager->getSumaFile(), "Die Suchmaschine " . $this->name . " wurde für 1h deaktiviert, weil keine Verbindung aufgebaut werden konnte");
            return;
        }

        $time = microtime(true);
        $this->writeRequest();
        $this->setStatistic("write_time", $this->elapsedSince($time));
    }

    /**
     * Parses a response body into $this->results (implemented per engine).
     *
     * @param string $result decoded response body
     */
    abstract public function loadResults($result);

    /**
     * Returns the time since $start in the scale used for all statistics and timeouts.
     *
     * NOTE(review): microtime(true) differences are seconds, so dividing by one
     * million yields extremely small numbers and the ">= 500" / "> 100" limits
     * that compare against this value are effectively unreachable. The scale is
     * kept unchanged because the values are persisted in Redis and the intended
     * unit cannot be determined from this file - confirm and correct in one place here.
     *
     * @param float $start value of microtime(true) at the start of the interval
     * @return float
     */
    private function elapsedSince($start)
    {
        return (microtime(true) - $start) / 1000000;
    }

    /**
     * Writes the HTTP GET request to the socket, replacing the socket if it breaks.
     *
     * Aborts with HTTP 500 when nothing can be written, when the time limit is
     * exceeded or when no working socket can be obtained after
     * MAX_WRITE_RECONNECTS replacements.
     */
    private function writeRequest()
    {
        $out = "GET " . $this->getString . " HTTP/1.1\r\n";
        $out .= "Host: " . $this->host . "\r\n";
        $out .= "User-Agent: " . $this->useragent . "\r\n";
        $out .= "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n";
        $out .= "Accept-Language: de,en-US;q=0.7,en;q=0.3\r\n";
        # Only gzip is advertised because retrieveResults() can only decode gzip;
        # a "deflate" or "br" response would be passed to loadResults() undecoded.
        $out .= "Accept-Encoding: gzip\r\n";
        $out .= "Connection: keep-alive\r\n\r\n";

        $total = strlen($out);
        $pending = $out;
        $sent = 0;
        $reconnects = 0;
        $time = microtime(true);

        while ($sent < $total) {
            try {
                $written = fwrite($this->fp, $pending);
            } catch (\ErrorException $e) {
                # Something happened to our socket; we need a new one.
                if (is_resource($this->fp)) {
                    fclose($this->fp);
                }
                # The slot is stored under the host name (see getFreeSocket()).
                $this->releaseSocketSlot();

                $reconnects++;
                $this->fp = ($reconnects <= self::MAX_WRITE_RECONNECTS) ? $this->getFreeSocket() : null;
                if (!$this->fp) {
                    abort(500, "Konnte die Request Daten nicht an: " . $this->name . " senden");
                }

                # Start over with the complete request on the new connection.
                $sent = 0;
                $pending = $out;
                continue;
            }

            if (!$written) {
                abort(500, "Fehler beim schreiben.");
            }
            $sent += $written;
            $pending = substr($pending, $written);

            if ($this->elapsedSince($time) >= 500) {
                abort(500, "Konnte die Request Daten nicht an: " . $this->name . " senden");
            }
        }
    }

    /**
     * Lets every result of this engine rank itself.
     *
     * @param \App\MetaGer $metager passed through to each result's rank() method
     */
    public function rank(\App\MetaGer $metager)
    {
        foreach ($this->results as $result) {
            $result->rank($metager);
        }
    }

    /**
     * Reserves a free socket slot for this host and opens the matching persistent socket.
     *
     * Depending on server load several sockets per host may be needed. pfsockopen()
     * opens a persistent socket that can be shared between PHP processes; a socket
     * whose target string matches an existing one is picked up and reused. A socket
     * must not be used while another process is communicating over it, so each slot
     * ("<host>.<n>") is reserved in Redis and the first unreserved slot is taken.
     *
     * @return resource|null the stream, or null if the connection could not be opened
     */
    private function getFreeSocket()
    {
        $counter = 0;
        while (true) {
            $lockKey = $this->host . "." . $counter;

            # SETNX reserves the slot atomically; a separate "exists" check followed
            # by "set" would let two processes claim the same slot.
            if (!Redis::setnx($lockKey, 1)) {
                $counter++;
                continue;
            }
            Redis::expire($lockKey, self::SOCKET_LOCK_TTL);
            $this->socketNumber = $counter;

            $fp = $this->openSocket($counter);
            if (!$fp) {
                $this->releaseSocketSlot();
                return null;
            }

            # Check whether the read buffer is empty; leftover data belongs to an
            # earlier response and would corrupt ours.
            stream_set_blocking($fp, 0);
            if (fgets($fp, BUFFER_LENGTH) !== false) {
                Log::error("Der Lesepuffer von: " . $this->name . " war nach dem Erstellen nicht leer. Musste den Socket neu starten.");
                fclose($fp);
                $fp = $this->openSocket($counter);
                if (!$fp) {
                    $this->releaseSocketSlot();
                    return null;
                }
            }

            # Expose the chosen slot as a response header; skipped once output has started.
            if (!headers_sent()) {
                header($this->name . ": " . $counter . "_" . $this->getHost());
            }
            return $fp;
        }
    }

    /**
     * Opens the persistent socket that belongs to one slot.
     *
     * The slot number is appended to the target string so that every slot maps
     * to its own persistent connection.
     *
     * @param int $counter slot number
     * @return resource|null the stream, or null when connecting failed
     */
    private function openSocket($counter)
    {
        $errno = 0;
        $errstr = "";
        try {
            $fp = pfsockopen($this->getHost() . ":" . $this->port . "/$counter", $this->port, $errno, $errstr, 1);
        } catch (\ErrorException $e) {
            return null;
        }
        return $fp ? $fp : null;
    }

    /**
     * Frees the Redis reservation of the slot held by this object, if any.
     *
     * Forgetting the slot number afterwards prevents a second release from
     * deleting a reservation another process has taken in the meantime.
     */
    private function releaseSocketSlot()
    {
        if ($this->socketNumber === null) {
            return;
        }
        Redis::del($this->host . "." . $this->socketNumber);
        $this->socketNumber = null;
    }

    /**
     * Folds one measurement into the running mean stored in Redis under $key.
     *
     * The stored value is the mean of the previous (uses - 1) measurements, so it
     * is weighted with (uses - 1) before the new sample is added. Negative samples
     * count as 0. The new mean is also written to $this->$key.
     *
     * @param string $key statistic name, e.g. "connection_time"
     * @param float  $val new measurement
     */
    private function setStatistic($key, $val)
    {
        $samples = isset($this->uses) ? max(1, intval($this->uses)) : 1;
        $previousTotal = floatval(Redis::hget($this->name, $key)) * ($samples - 1);
        $newVal = ($previousTotal + max($val, 0)) / $samples;

        Redis::hset($this->name, $key, $newVal);
        $this->$key = $newVal;
    }

    /**
     * Looks up this engine's <suma> element in a loaded suma document.
     *
     * @param mixed $xml result of simplexml_load_file()
     * @return \SimpleXMLElement|null the element, or null if the file failed to load or has no such element
     */
    private function findSumaNode($xml)
    {
        if (!($xml instanceof \SimpleXMLElement)) {
            return null;
        }
        $nodes = $xml->xpath("//sumas/suma[@name='" . $this->name . "']");
        return empty($nodes) ? null : $nodes[0];
    }

    /**
     * Marks this engine as disabled for one hour in the suma file.
     *
     * @param string $sumaFile path of the XML file listing the engines
     * @param string $message  text written to the info log
     */
    public function disable($sumaFile, $message)
    {
        Log::info($message);
        $xml = simplexml_load_file($sumaFile);
        $node = $this->findSumaNode($xml);
        if ($node === null) {
            Log::error("Die Suchmaschine " . $this->name . " konnte in " . $sumaFile . " nicht deaktiviert werden.");
            return;
        }
        $node['disabled'] = date(DATE_RFC822, time() + 3600);
        $xml->saveXML($sumaFile);
    }

    /**
     * Removes the "disabled" marker of this engine from the suma file.
     *
     * @param string $sumaFile path of the XML file listing the engines
     * @param string $message  text written to the info log
     */
    public function enable($sumaFile, $message)
    {
        Log::info($message);
        $xml = simplexml_load_file($sumaFile);
        $node = $this->findSumaNode($xml);
        if ($node === null) {
            Log::error("Die Suchmaschine " . $this->name . " konnte in " . $sumaFile . " nicht aktiviert werden.");
            return;
        }
        unset($node['disabled']);
        $xml->saveXML($sumaFile);
    }

    /**
     * Closes the socket if it is still an open stream.
     */
    public function closeFp()
    {
        if (is_resource($this->fp)) {
            fclose($this->fp);
        }
    }

    /**
     * Reads the HTTP response from the socket and passes the body to loadResults().
     *
     * Returns without setting $this->loaded when there is no socket or when fewer
     * than two headers could be read (the header read is non-blocking, so this is
     * what happens while the response has not arrived yet).
     */
    public function retrieveResults()
    {
        if (!$this->fp) {
            return;
        }
        $time = microtime(true);

        $headers = $this->readHeaders($time);
        if (sizeof($headers) <= 1) {
            return;
        }

        $body = '';
        stream_set_blocking($this->fp, 1);
        if (isset($headers["Transfer-Encoding"]) && $headers["Transfer-Encoding"] === "chunked") {
            $body = $this->readChunked();
        } elseif (isset($headers['Content-Length'])) {
            $length = trim($headers['Content-Length']);
            if (is_numeric($length) && $length >= 1) {
                $body = $this->readBody($length);
            }
        } else {
            # print_r() needs its second argument to return the dump instead of echoing it.
            die("Konnte nicht herausfinden, wie ich die Serverantwort von: " . $this->name . " auslesen soll. Header war: " . print_r($headers, true));
        }
        $this->loaded = true;

        # The response is complete, other processes may use the slot again.
        $this->releaseSocketSlot();
        $this->setStatistic("read_time", $this->elapsedSince($time));

        if (isset($headers["Content-Encoding"]) && $headers['Content-Encoding'] === "gzip") {
            $body = $this->gunzip($body);
        }

        $this->loadResults($body);
    }

    /**
     * Reads response header lines without blocking until the blank line that ends them.
     *
     * Lines that do not contain ": " (such as the status line) are skipped.
     *
     * @param float $time microtime(true) taken when reading started
     * @return array header name => value
     */
    private function readHeaders($time)
    {
        $headers = [];
        stream_set_blocking($this->fp, 0);

        while (true) {
            # fgets() stops at the first newline or when the buffer is full.
            $data = fgets($this->fp, BUFFER_LENGTH);

            # A single CRLF marks the end of the headers.
            if ($data === false || $data == CRLF || feof($this->fp) || $this->elapsedSince($time) > 100) {
                break;
            }

            # Limit 2 keeps headers whose value itself contains ": ".
            $parts = explode(": ", $data, 2);
            if (sizeof($parts) === 2) {
                $headers[trim($parts[0])] = trim($parts[1]);
            }
        }

        return $headers;
    }

    /**
     * Closes the socket and frees its slot reservation.
     */
    public function shutdown()
    {
        # is_resource() is false for an already closed stream, so a prior closeFp() is safe.
        if (is_resource($this->fp)) {
            fclose($this->fp);
        }
        $this->releaseSocketSlot();
    }

    /**
     * Reads a response body of known length, giving up after one second.
     *
     * @param int|string $length value of the Content-Length header
     * @return string the bytes read; shorter than $length if the connection ended or the second elapsed
     */
    private function readBody($length)
    {
        $remaining = intval($length);
        $theData = '';
        stream_set_blocking($this->fp, 0);

        # microtime() gives a real one-second window; whole-second time() values
        # made the window anything between almost zero and one second.
        $deadline = microtime(true) + 1.0;

        while ($remaining > 0 && !feof($this->fp) && microtime(true) < $deadline) {
            # Never ask for more than is left, so nothing beyond this response is consumed.
            $chunk = fread($this->fp, min($remaining, BUFFER_LENGTH));
            if ($chunk === false || $chunk === '') {
                # Nothing available yet on the non-blocking stream.
                usleep(100);
                continue;
            }
            $theData .= $chunk;
            $remaining -= strlen($chunk);
        }

        return $theData;
    }

    /**
     * Reads a body sent with "Transfer-Encoding: chunked".
     *
     * Nothing but newlines may precede the first chunk-size line.
     *
     * @return string the concatenated chunk data read so far
     */
    private function readChunked()
    {
        $body = '';

        while (true) {
            # The line holding the length of the next chunk.
            $line = fgets($this->fp, BUFFER_LENGTH);
            if ($line === false) {
                break;
            }

            # A bare newline is the terminator of the previous chunk; read on.
            if ($line == CRLF) {
                continue;
            }

            # The size is hexadecimal. Drop optional ";extension" parts and the line
            # ending first, because hexdec() would otherwise misread hex letters in
            # them and complains about invalid characters on newer PHP versions.
            $sizeParts = explode(";", $line, 2);
            $length = hexdec(trim($sizeParts[0]));

            if (!is_int($length)) {
                trigger_error('Most likely not chunked encoding', E_USER_ERROR);
            }

            # Zero is sent at the end of the chunks.
            if ($length < 1 || feof($this->fp)) {
                if ($length <= 0) {
                    # Consume the line that follows the terminating chunk.
                    fgets($this->fp, BUFFER_LENGTH);
                }
                break;
            }

            # Read until the whole chunk has arrived.
            while (true) {
                $data = fread($this->fp, $length);
                if ($data === false || $data === '') {
                    # The stream is blocking here, so an empty read means the
                    # connection ended or timed out; return what we have.
                    return $body;
                }

                $length -= strlen($data);
                $body .= $data;

                if ($length <= 0 || feof($this->fp)) {
                    if ($length <= 0) {
                        # Consume the CRLF that terminates the chunk.
                        fgets($this->fp, BUFFER_LENGTH);
                    }
                    break;
                }
            }
        }

        return $body;
    }

    /**
     * Decompresses a gzip encoded body.
     *
     * gzdecode() parses the gzip header itself, so members carrying optional
     * header fields (file name, comment, extra data) are handled as well.
     * Aborts with HTTP 500 if the data is corrupt.
     *
     * @param string $zipped raw body
     * @return string the decompressed data, or "Unknown Format" if $zipped has no gzip/deflate header
     */
    private function gunzip($zipped)
    {
        # Magic bytes 1f 8b followed by compression method 8 (deflate).
        if (substr($zipped, 0, 3) !== "\x1f\x8b\x08") {
            return "Unknown Format";
        }

        try {
            $plain = gzdecode($zipped);
        } catch (\Exception $e) {
            # Raised when warnings are converted to exceptions by the error handler.
            $plain = false;
        }

        if ($plain === false) {
            abort(500, "Fehler beim unzip des Ergebnisses von folgendem Anbieter: " . $this->name);
        }
        return $plain;
    }

    /**
     * Builds the socket target for this engine.
     *
     * @return string "tls://<host>" when the port is the string "443", otherwise "tcp://<host>"
     */
    protected function getHost()
    {
        $scheme = ($this->port === "443") ? "tls://" : "tcp://";
        return $scheme . $this->host;
    }

    /**
     * @return mixed result of curl_getinfo() for $this->ch
     */
    public function getCurlInfo()
    {
        return curl_getinfo($this->ch);
    }

    /**
     * @return int result of curl_errno() for $this->ch
     */
    public function getCurlErrors()
    {
        return curl_errno($this->ch);
    }

    /**
     * Adds $this->ch to a cURL multi handle.
     *
     * @param mixed $mh cURL multi handle
     */
    public function addCurlHandle($mh)
    {
        curl_multi_add_handle($mh, $this->ch);
    }

    /**
     * Removes $this->ch from a cURL multi handle.
     *
     * @param mixed $mh cURL multi handle
     */
    public function removeCurlHandle($mh)
    {
        curl_multi_remove_handle($mh, $this->ch);
    }

    /**
     * Builds the request target from the engine's script path and form data.
     *
     * The placeholders <<USERAGENT>>, <<QUERY>>, <<IP>>, <<LANGUAGE>> and
     * <<CATEGORY>> are replaced with their URL-encoded values, <<AFFILDATA>>
     * with the output of getOvertureAffilData().
     *
     * @param string $query    search terms
     * @param string $url      URL passed on as "serveUrl"
     * @param string $language language code
     * @param string $category search category
     * @return string path plus optional query string
     */
    private function generateGetString($query, $url, $language, $category)
    {
        # Script path, "/" if the engine defines none.
        $getString = (isset($this->skript) && strlen($this->skript) > 0) ? $this->skript : "/";

        # Form data becomes the query string.
        if (isset($this->formData) && strlen($this->formData) > 0) {
            $getString .= "?" . $this->formData;
        }

        # Replaced in this order, exactly as listed.
        $placeholders = [
            "<<USERAGENT>>" => $this->useragent,
            "<<QUERY>>"     => $query,
            "<<IP>>"        => $this->ip,
            "<<LANGUAGE>>"  => $language,
            "<<CATEGORY>>"  => $category,
        ];
        foreach ($placeholders as $placeholder => $value) {
            # "!== false" also matches a placeholder at offset 0.
            if (strpos($getString, $placeholder) !== false) {
                $getString = str_replace($placeholder, $this->urlEncode($value), $getString);
            }
        }

        if (strpos($getString, "<<AFFILDATA>>") !== false) {
            $getString = str_replace("<<AFFILDATA>>", $this->getOvertureAffilData($url), $getString);
        }

        return $getString;
    }

    /**
     * URL-encodes a string, converting it to the engine's input encoding first if one is configured.
     *
     * @param string $string
     * @return string
     */
    protected function urlEncode($string)
    {
        if (isset($this->inputEncoding)) {
            $string = mb_convert_encoding($string, $this->inputEncoding);
        }
        return urlencode($string);
    }

    /**
     * Builds the "affilData" and "serveUrl" query parameters.
     *
     * affilData carries the client IP, the user agent and, if present, the
     * X-Forwarded-For request header.
     *
     * @param string $url value for the serveUrl parameter
     * @return string "&affilData=...&serveUrl=..." with both values URL-encoded
     */
    private function getOvertureAffilData($url)
    {
        $affil_data = 'ip=' . $this->ip . '&ua=' . $this->useragent;
        if (isset($_SERVER['HTTP_X_FORWARDED_FOR'])) {
            $affil_data .= '&xfip=' . $_SERVER['HTTP_X_FORWARDED_FOR'];
        }

        return "&affilData=" . $this->urlEncode($affil_data) . "&serveUrl=" . $this->urlEncode($url);
    }

    /**
     * @return bool false if the constructor found the engine temporarily disabled
     */
    public function isEnabled()
    {
        return $this->enabled;
    }
}