<?php

namespace App\Models;

use App\Jobs\Searcher;
use App\MetaGer;
use Cache;
use Illuminate\Foundation\Bus\DispatchesJobs;
use Illuminate\Support\Facades\Redis;

/**
 * Base class for a single search engine that MetaGer queries.
 *
 * An instance is configured from the engine's XML element: the constructor
 * copies every XML attribute onto a property of the same name. Concrete
 * engines implement loadResults() to parse the fetched response body.
 */
abstract class Searchengine
{
    use DispatchesJobs;

    public $ch; # cURL handle used to obtain the results
    public $getString = ""; # the string used for the GET request
    public $engine; # the original engine XML
    public $enabled = true; # true unless the search engine is explicitly disabled
    public $results = []; # the loaded results
    public $ads = []; # the loaded advertisements
    public $products = []; # the loaded products
    public $loaded = false; # true as soon as the results have been loaded
    public $cached = false; # set by startSearch() when the cache already holds an entry for $hash

    public $ip; # the IP taken from the MetaGer instance
    public $uses; # number of uses of this search engine
    public $homepage; # homepage of this search engine
    public $name; # name of this search engine
    public $disabled; # whether this engine is switched off; the constructor parses it with strtotime()
    public $useragent; # the HTTP user agent
    public $startTime; # creation time of this search engine (string returned by microtime())
    public $hash; # hash value of this search engine (host, GET string, port and name)

    # The following properties are read or written by this class but were
    # previously only created dynamically, which is deprecated since PHP 8.2.
    # All of them stay unset (null) until an XML attribute of the same name or
    # the constructor assigns them, so isset() checks behave as before.
    public $host; # host of the engine; used to build the request URL
    public $port; # port of the engine; "443" selects https/tls
    public $skript; # request path; "/" is used when empty
    public $formData; # query string template, may contain <<PLACEHOLDER>> markers
    public $hasSiteSearch; # "1" if " site:<site>" may be appended to the query
    public $inputEncoding; # target encoding passed to mb_convert_encoding() in urlEncode()
    public $cacheDuration; # duration handed to Cache::put(); defaults to 60
    public $additionalHeaders; # defaults to ""; not read inside this class
    public $resultHash; # hash under which results of the current search are exchanged via Redis
    public $canCache; # whether the cache may be used for the current search

    private $user; # username for HTTP auth (if given)
    private $password; # password for HTTP auth (if given)

    public $fp; # needed for artifacts
    public $socketNumber = null; # needed for artifacts
    public $counter = 0; # possibly needed for artifacts
    public $write_time = 0; # possibly needed for artifacts
    public $connection_time = 0; # possibly needed for artifacts

    /**
     * Configures the engine from its XML element and the current MetaGer request.
     *
     * If the engine is still temporarily disabled, only the XML attributes and
     * defaults are applied and the constructor returns early with
     * $enabled === false; the GET string and hashes are not initialised then.
     *
     * @param \SimpleXMLElement $engine  XML element describing this engine
     * @param MetaGer           $metager source of query, language, IP, user agent etc.
     */
    public function __construct(\SimpleXMLElement $engine, MetaGer $metager)
    {
        # Load as many attributes as possible from the engine XML.
        # Attributes without a declared property become dynamic properties.
        foreach ($engine->attributes() as $key => $value) {
            $this->$key = $value->__toString();
        }

        # Default homepage is metager.de
        if (!isset($this->homepage)) {
            $this->homepage = "https://metager.de";
        }

        # Keep the XML of the engine
        $this->engine = $engine->asXML();

        # Default cache duration is 60
        if (!isset($this->cacheDuration)) {
            $this->cacheDuration = 60;
        }

        $this->enabled = true;

        # A search engine can be disabled temporarily and automatically
        # after connection problems.
        if (isset($this->disabled)) {
            if (strtotime($this->disabled) <= time()) {
                # The timeout of the search engine has expired.
                $this->enable($metager->getSumaFile(), "Die Suchmaschine " . $this->name . " wurde wieder eingeschaltet.");
            } else {
                $this->enabled = false;
                return;
            }
        }

        $this->useragent = $metager->getUserAgent();
        $this->ip = $metager->getIp();
        $this->startTime = microtime();

        # Build the search string; engines with site search get " site:<site>"
        # appended when a site restriction was requested.
        $q = $metager->getQ();
        if (isset($this->hasSiteSearch) && $this->hasSiteSearch === "1" && strlen($metager->getSite()) !== 0) {
            $q .= " site:" . $metager->getSite();
        }

        $this->getString = $this->generateGetString($q, $metager->getUrl(), $metager->getLanguage(), $metager->getCategory());
        $this->hash = md5($this->host . $this->getString . $this->port . $this->name);
        $this->resultHash = $metager->getHashCode();
        $this->canCache = $metager->canCache();
        if (!isset($this->additionalHeaders)) {
            $this->additionalHeaders = "";
        }
    }

    /**
     * Parses the fetched response body; implemented by each concrete engine.
     *
     * @param mixed $result response body as passed in by retrieveResults()
     */
    abstract public function loadResults($result);

    /**
     * Default implementation of getNext so that it can always be called.
     * Does nothing; engines may override it.
     *
     * @param MetaGer $metager
     * @param mixed   $result  response body as passed in by retrieveResults()
     */
    public function getNext(MetaGer $metager, $result)
    {
    }

    /**
     * Checks whether the search is already cached; otherwise submits it to the
     * engine's Redis queue and, if required, dispatches a new Searcher job.
     *
     * @param \App\MetaGer $metager
     */
    public function startSearch(\App\MetaGer $metager)
    {
        if ($this->canCache && Cache::has($this->hash)) {
            $this->cached = true;
            $this->retrieveResults($metager);
            return;
        }

        // Push the confirmation of the submission to the result hash.
        Redis::hset('search.' . $this->resultHash, $this->name, "waiting");

        // Missions are submitted to a Redis queue in the string format
        //   <ResultHash>;<base64 encoded URL to fetch>;<time>
        // <ResultHash> is the hash where the fetcher will store the result,
        // the URL is the full URL to the search engine and <time> is the
        // value of $metager->getTime().
        $mission = $this->resultHash . ";" . base64_encode($this->buildSearchUrl()) . ";" . $metager->getTime();

        // Each Searcher is dedicated to one specific search engine, so each
        // engine has its own queue under the Redis key <name>.queue
        Redis::rpush($this->name . ".queue", $mission);

        // NOTE(review): get-then-set is not atomic, so two concurrent requests
        // could both dispatch a Searcher. Kept as is because the meaning of
        // other values of this key is not visible from this file.
        if ($this->needsSearcher() && Redis::get($this->name) !== "locked") {
            Redis::set($this->name, "locked");
            $this->dispatch(new Searcher($this->name, $this->user, $this->password));
        }
    }

    /**
     * Builds the full URL a Searcher has to fetch for the current search.
     *
     * The scheme is https for port "443" and http otherwise. The port is only
     * appended when it is set and is neither "80" nor "443". The port arrives
     * as a string from the XML attributes, so it is compared as a string.
     *
     * @return string
     */
    private function buildSearchUrl()
    {
        $port = (string) $this->port;

        $url = ($port === "443" ? "https://" : "http://") . $this->host;
        if ($port !== "" && $port !== "80" && $port !== "443") {
            $url .= ":" . $port;
        }

        return $url . $this->getString;
    }

    /**
     * Decides whether another Searcher process should be started.
     *
     * Searcher processes are dedicated to one search engine each; several may
     * run for the same engine. Every running Searcher reports through the
     * Redis hash <name>.stats how long it waited for its last fetch job. The
     * longer that time, the less the engine is used and the fewer Searchers
     * are needed. If the average drops below the hardcoded minimum of 100ms
     * (.1) the Searchers are close to full workload and need assistance.
     *
     * @return bool true if no Searcher is running or the running ones are saturated
     */
    private function needsSearcher()
    {
        $searcherData = Redis::hgetall($this->name . ".stats");

        // One entry per running Searcher; none at all means we have to spawn one.
        if (empty($searcherData)) {
            return true;
        }

        // Average of the reported waiting times (second ";"-separated field).
        $totalWait = 0;
        foreach ($searcherData as $entry) {
            $fields = explode(";", $entry);
            $totalWait += isset($fields[1]) ? floatval($fields[1]) : 0;
        }

        return ($totalWait / count($searcherData)) < .1;
    }

    /**
     * Calls the ranking function of every loaded result.
     *
     * @param mixed $eingabe value handed through to each result's rank() method
     */
    public function rank($eingabe)
    {
        foreach ($this->results as $result) {
            $result->rank($eingabe);
        }
    }

    /**
     * Folds $val into the statistic $key stored in the Redis hash <name>,
     * weighted by the number of uses, and mirrors the new value onto $this->$key.
     * Negative values count as 0. Does nothing when the number of uses is not
     * a positive number, because the computation divides by it.
     *
     * @param string    $key
     * @param int|float $val
     */
    private function setStatistic($key, $val)
    {
        $uses = floatval($this->uses);
        if ($uses <= 0) {
            return;
        }

        $oldVal = floatval(Redis::hget($this->name, $key)) * $uses;
        $newVal = ($oldVal + max($val, 0)) / $uses;
        Redis::hset($this->name, $key, $newVal);
        $this->$key = $newVal;
    }

    /**
     * Removes the "disabled" attribute of this search engine from the suma
     * file (if present) and marks the engine as enabled.
     *
     * The file is only rewritten when it could be loaded and contains an
     * entry for this engine; the engine is enabled in memory in any case.
     *
     * @param string $sumaFile path of the XML file listing the search engines
     * @param string $message  currently unused
     */
    public function enable($sumaFile, $message)
    {
        $xml = simplexml_load_file($sumaFile);
        if ($xml !== false) {
            $nodes = $xml->xpath("//sumas/suma[@name='" . $this->name . "']");
            if (!empty($nodes)) {
                $node = $nodes[0];
                unset($node['disabled']);
                $xml->saveXML($sumaFile);
            }
        }

        $this->enabled = true;
    }

    /**
     * Sets the hash under which the results of the current search are stored.
     *
     * @param string $hash
     */
    public function setResultHash($hash)
    {
        $this->resultHash = $hash;
    }

    /**
     * Closes the file pointer of this engine if one is open.
     */
    public function closeFp()
    {
        if (is_resource($this->fp)) {
            fclose($this->fp);
        }
    }

    /**
     * Opens a new (persistent) socket for this engine.
     *
     * NOTE(review): the number is read from 'search.' . $this->hash while the
     * other methods use 'search.' . $this->resultHash; confirm this is intended.
     *
     * @return resource|false|null null if Redis holds no number for this engine,
     *                             otherwise the result of pfsockopen()
     */
    public function getSocket()
    {
        $number = Redis::hget('search.' . $this->hash, $this->name);
        if ($number === null) {
            return null;
        }

        # pfsockopen() fills its 3rd argument with the error code and its 4th
        # with the error message; both are ignored here.
        return pfsockopen($this->getHost() . ":" . $this->port . "/$number", $this->port, $errno, $errstr, 1);
    }

    /**
     * Fetches the response of this engine from the cache or from Redis and
     * loads it via loadResults() and getNext().
     *
     * A body read from Redis is removed there and, when caching is allowed,
     * stored in the cache. The status markers "waiting" and "connected" are
     * not real responses and are therefore never written to the cache.
     *
     * @param MetaGer $metager
     * @return bool true if the results are loaded, false if no response is available yet
     */
    public function retrieveResults(MetaGer $metager)
    {
        if ($this->loaded) {
            return true;
        }

        $useCache = $this->canCache && $this->cacheDuration > 0;
        $redisKey = 'search.' . $this->resultHash;

        $body = "";
        if ($useCache && Cache::has($this->hash)) {
            $body = Cache::get($this->hash);
        } elseif (Redis::hexists($redisKey, $this->name)) {
            $body = Redis::hget($redisKey, $this->name);
            Redis::hdel($redisKey, $this->name);
            if ($useCache && $body !== "connected" && $body !== "waiting") {
                Cache::put($this->hash, $body, $this->cacheDuration);
            }
        }

        if ($body === "" || $body === "connected" || $body === "waiting") {
            return false;
        }

        $this->loadResults($body);
        $this->getNext($metager, $body);
        $this->loaded = true;
        return true;
    }

    /**
     * Deletes the Redis key "<host>.<socketNumber>" of this engine.
     */
    public function shutdown()
    {
        Redis::del($this->host . "." . $this->socketNumber);
    }

    /**
     * Builds the host link used for the GET request: "tls://<host>" for
     * port "443", "tcp://<host>" otherwise.
     *
     * @return string
     */
    protected function getHost()
    {
        $transport = $this->port === "443" ? "tls://" : "tcp://";
        return $transport . $this->host;
    }

    /**
     * Builds the string used for the GET request (path plus query string)
     * and fills in the placeholders it contains.
     *
     * @param string $query    search query
     * @param string $url      URL passed on as serveUrl for <<AFFILDATA>>
     * @param string $language value for <<LANGUAGE>>
     * @param string $category value for <<CATEGORY>>
     * @return string
     */
    private function generateGetString($query, $url, $language, $category)
    {
        # Script: fall back to "/" when none is configured.
        $script = (string) $this->skript;
        $getString = strlen($script) > 0 ? $script : "/";

        # Form data
        $formData = (string) $this->formData;
        if (strlen($formData) > 0) {
            $getString .= "?" . $formData;
        }

        # Some placeholders in the GET string still have to be replaced.
        # strpos() returns 0 for a match at the very beginning, so the result
        # must be compared against false explicitly.
        $placeholders = [
            "<<USERAGENT>>" => $this->useragent,
            "<<QUERY>>" => $query,
            "<<IP>>" => $this->ip,
            "<<LANGUAGE>>" => $language,
            "<<CATEGORY>>" => $category,
        ];
        foreach ($placeholders as $placeholder => $value) {
            if (strpos($getString, $placeholder) !== false) {
                $getString = str_replace($placeholder, $this->urlEncode($value), $getString);
            }
        }

        # Affiliate data; already URL-encoded by getOvertureAffilData()
        if (strpos($getString, "<<AFFILDATA>>") !== false) {
            $getString = str_replace("<<AFFILDATA>>", $this->getOvertureAffilData($url), $getString);
        }

        return $getString;
    }

    /**
     * URL-encodes a string, converting it to the inputEncoding of this
     * search engine first if one is set.
     *
     * @param string $string
     * @return string
     */
    protected function urlEncode($string)
    {
        if (isset($this->inputEncoding)) {
            $string = mb_convert_encoding($string, $this->inputEncoding);
        }

        return urlencode($string);
    }

    /**
     * Returns the special data for Yahoo: the "affilData" (IP and user agent)
     * and "serveUrl" query parameters, both URL-encoded.
     *
     * @param string $url URL sent as serveUrl
     * @return string string of the form "&affilData=...&serveUrl=..."
     */
    private function getOvertureAffilData($url)
    {
        $affilData = 'ip=' . $this->ip . '&ua=' . $this->useragent;

        return "&affilData=" . $this->urlEncode($affilData) . "&serveUrl=" . $this->urlEncode($url);
    }

    /**
     * @return bool whether this engine is enabled
     */
    public function isEnabled()
    {
        return $this->enabled;
    }

    # Artifact methods

    /**
     * @return mixed result of curl_getinfo() for this engine's cURL handle
     */
    public function getCurlInfo()
    {
        return curl_getinfo($this->ch);
    }

    /**
     * @return int result of curl_errno() for this engine's cURL handle
     */
    public function getCurlErrors()
    {
        return curl_errno($this->ch);
    }

    /**
     * Adds this engine's cURL handle to the given multi handle.
     *
     * @param resource|\CurlMultiHandle $mh
     */
    public function addCurlHandle($mh)
    {
        curl_multi_add_handle($mh, $this->ch);
    }

    /**
     * Removes this engine's cURL handle from the given multi handle.
     *
     * @param resource|\CurlMultiHandle $mh
     */
    public function removeCurlHandle($mh)
    {
        curl_multi_remove_handle($mh, $this->ch);
    }
}