open-source / MetaGer · Commits

Commit a4ce133c
authored Sep 17, 2018 by Dominik Hebeler

    Scopia eingebaut (Integrated the Scopia search engine)

parent c0893c01
Changes: 4 files
app/Jobs/Searcher.php
@@ -3,18 +3,16 @@
 namespace App\Jobs;

 use Illuminate\Bus\Queueable;
-use Illuminate\Queue\SerializesModels;
-use Illuminate\Queue\InteractsWithQueue;
-use Illuminate\Contracts\Queue\ShouldQueue;
+use Illuminate\Contracts\Queue\ShouldQueue;
+use Illuminate\Foundation\Bus\DispatchesJobs;
+use Illuminate\Queue\InteractsWithQueue;
+use Illuminate\Queue\SerializesModels;
 use Illuminate\Support\Facades\Redis;
 use Log;

 class Searcher implements ShouldQueue
 {
     use InteractsWithQueue, Queueable, SerializesModels;

-    protected $name, $ch, $pid, $counter, $lastTime, $connectionInfo;
+    protected $name, $ch, $pid, $counter, $lastTime, $connectionInfo, $user, $password;

     # Each Searcher will shutdown after a specified time(s) or number of requests
     protected $MAX_REQUESTS = 100;
     # This value should always be below the retry_after value in config/queue.php
@@ -34,12 +32,14 @@ class Searcher implements ShouldQueue
      * keep-alive requests.
      * @return void
      */
-    public function __construct($name)
+    public function __construct($name, $user = null, $password = null)
     {
         $this->name = $name;
         $this->pid = getmypid();
         $this->recheck = false;
         $this->startTime = microtime(true);
+        $this->user = $user;
+        $this->password = $password;
         // Submit this worker to the Redis System
         Redis::expire($this->name, 5);
     }
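Note: the constructor now takes optional HTTP credentials for engines that require authentication. A minimal sketch of dispatching the job with and without them; the engine name and credential values below are placeholders, not taken from this commit:

<?php
// Sketch only: dispatching the queued Searcher job via Laravel's dispatch() helper.
// Without credentials the curl handle is built exactly as before.
dispatch(new \App\Jobs\Searcher("scopia"));

// With credentials, $user and $password are stored on the job and later passed to curl
// as CURLOPT_USERPWD (see the initCurlHandle() hunk further down).
dispatch(new \App\Jobs\Searcher("scopia", "example-user", "example-secret"));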
@@ -53,9 +53,10 @@ class Searcher implements ShouldQueue
     {
         // This Searches is freshly called so we need to initialize the curl handle $ch
         $this->ch = $this->initCurlHandle();
+        try {
         $this->counter = 0; // Counts the number of answered jobs
         $time = microtime(true);
-        while (true){
+        while (true) {
             // Update the expire
             Redis::expire($this->name, 5);
             Redis::expire($this->name . ".stats", 5);
@@ -69,7 +70,7 @@ class Searcher implements ShouldQueue
             $this->updateStats(microtime(true) - $time);
             $this->switchToRunning();
             // The mission can be empty when blpop hit the timeout
-            if (!empty($mission)){
+            if (!empty($mission)) {
                 $mission = $mission[1];
                 $poptime = microtime(true) - $time;
@@ -97,17 +98,20 @@ class Searcher implements ShouldQueue
                 // In sync mode every Searcher may only retrieve one result because it would block
                 // the execution of the remaining code otherwise:
                 if (getenv("QUEUE_DRIVER") === "sync"
                     || $this->counter > $this->MAX_REQUESTS
-                    || (microtime(true) - $this->startTime) > $this->MAX_TIME){
+                    || (microtime(true) - $this->startTime) > $this->MAX_TIME) {
                     break;
                 }
             }
+        } finally {
             // When we reach this point, time has come for this Searcher to retire
             $this->shutdown();
         }
     }

-    private function switchToRunning(){
+    private function switchToRunning()
+    {
         /**
          * When a Searcher is initially started the redis value for $this->name is set to "locked"
          * which effectively will prevent new Searchers of this type to be started. (Value is checked by the MetaGer process which starts the Searchers)
@@ -119,25 +123,27 @@ class Searcher implements ShouldQueue
          * When a search engine needs more time to produce search results than the timeout of the MetaGer process, we won't even bother of spawning
          * more and more Searchers because they would just block free worker processes from serving the important engines which will give results in time.
          **/
-        if ($this->counter === 3 || getenv("QUEUE_DRIVER") === "sync"){
+        if ($this->counter === 3 || getenv("QUEUE_DRIVER") === "sync") {
             # If the MetaGer process waits longer for the results than this Fetcher will probably need to fetch
             # Or if this engine is in the array of important engines which we will always try to serve
             Redis::set($this->name, "running");
             $this->recheck = false;
         }
     }

-    private function updateStats($poptime){
-        if ($this->connectionInfo !== NULL){
+    private function updateStats($poptime)
+    {
+        if ($this->connectionInfo !== null) {
             $connectionInfo = base64_encode(json_encode($this->connectionInfo));
             Redis::hset($this->name . ".stats", $this->pid, $connectionInfo . ";" . $poptime);
         }
     }

-    private function getFetchTime(){
+    private function getFetchTime()
+    {
         $vals = Redis::hgetall($this->name . ".stats");
-        if (sizeof($vals) === 0){
+        if (sizeof($vals) === 0) {
             return 0;
         } else {
             $totalTime = 0;
             foreach ($vals as $pid => $value) {
                 $time = floatval(json_decode(base64_decode(explode(";", $value)[0]), true)["total_time"]);
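Note: each field in the "<name>.stats" Redis hash packs the curl connection info and the blpop wait time into a single string, base64_encode(json_encode($connectionInfo)) . ";" . $poptime. Since getFetchTime() reads a "total_time" key back out, $connectionInfo is presumably the array returned by curl_getinfo(). A small decoding sketch with made-up values:

<?php
// Sketch only: decode one hypothetical ".stats" hash field the way getFetchTime() does.
$connectionInfo = ["total_time" => 0.42, "http_code" => 200]; // shape assumed from curl_getinfo()
$poptime        = 0.013;                                      // seconds spent waiting in blpop

$encoded = base64_encode(json_encode($connectionInfo)) . ";" . $poptime;

// Reverse the encoding: the first part is the connection info, the second the pop time.
// base64 output never contains ";", so the explode() is unambiguous.
list($infoPart, $popPart) = explode(";", $encoded);
$totalTime = floatval(json_decode(base64_decode($infoPart), true)["total_time"]);

echo $totalTime . " / " . floatval($popPart) . "\n"; // 0.42 / 0.013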
@@ -149,7 +155,8 @@ class Searcher implements ShouldQueue
         }
     }

-    private function retrieveUrl($url){
+    private function retrieveUrl($url)
+    {
         // Set this URL to the Curl handle
         curl_setopt($this->ch, CURLOPT_URL, $url);
         $result = curl_exec($this->ch);
@@ -157,36 +164,43 @@ class Searcher implements ShouldQueue
         return $result;
     }

-    private function storeResult($result, $poptime, $hashValue){
+    private function storeResult($result, $poptime, $hashValue)
+    {
         Redis::hset('search.' . $hashValue, $this->name, $result);
         // After 60 seconds the results should be read by the MetaGer Process and stored in the Cache instead
         Redis::expire('search.' . $hashValue, 60);
         $this->lastTime = microtime(true);
     }

-    private function shutdown(){
+    private function shutdown()
+    {
         Redis::hdel($this->name . ".stats", $this->pid);
-        if (sizeof(Redis::hgetall($this->name . ".stats")) === 0){
+        if (sizeof(Redis::hgetall($this->name . ".stats")) === 0) {
             Redis::del($this->name);
         }
         // We should close our curl handle before we do so
         curl_close($this->ch);
     }

-    private function initCurlHandle(){
+    private function initCurlHandle()
+    {
         $ch = curl_init();
         curl_setopt_array($ch, array(
             CURLOPT_RETURNTRANSFER => 1,
             CURLOPT_USERAGENT => "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1",
-            CURLOPT_FOLLOWLOCATION => TRUE,
+            CURLOPT_FOLLOWLOCATION => true,
             CURLOPT_CONNECTTIMEOUT => 10,
             CURLOPT_MAXCONNECTS => 500,
             CURLOPT_LOW_SPEED_LIMIT => 500,
             CURLOPT_LOW_SPEED_TIME => 5,
-            CURLOPT_TIMEOUT => 10
+            CURLOPT_TIMEOUT => 10,
         ));
+        if ($this->user !== null && $this->password !== null) {
+            curl_setopt($ch, CURLOPT_USERPWD, $this->user . ":" . $this->password);
+        }
         return $ch;
     }
 }
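Note: the only functional change to initCurlHandle() is the optional CURLOPT_USERPWD, which makes curl send HTTP Basic Auth. A self-contained sketch of the same pattern outside the job class; the URL and credentials are placeholders:

<?php
// Sketch only: build a curl handle with a subset of the same options and optional Basic Auth,
// then fetch a placeholder URL. Values are illustrative, not from the MetaGer configuration.
function makeHandle($user = null, $password = null)
{
    $ch = curl_init();
    curl_setopt_array($ch, array(
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT        => 10,
    ));
    if ($user !== null && $password !== null) {
        // curl sends this as an "Authorization: Basic ..." header by default.
        curl_setopt($ch, CURLOPT_USERPWD, $user . ":" . $password);
    }
    return $ch;
}

$ch = makeHandle("example-user", "example-secret");
curl_setopt($ch, CURLOPT_URL, "https://example.com/search?q=test");
$body = curl_exec($ch);
curl_close($ch);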
app/Models/Searchengine.php
@@ -114,13 +114,13 @@ abstract class Searchengine
         // With <ResultHash> being the Hash Value where the fetcher will store the result.
         // and <URL to fetch> being the full URL to the searchengine
         $url = "";
-        if ($this->port === "443"){
+        if ($this->port === "443") {
             $url = "https://";
         } else {
             $url = "http://";
         }
         $url .= $this->host;
-        if ($this->port !== 80 && $this->port !== 443){
+        if ($this->port !== 80 && $this->port !== 443) {
             $url .= ":" . $this->port;
         }
         $url .= $this->getString;
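Note: a worked example of the URL assembly above with hypothetical engine values; only non-standard ports are appended to the host:

<?php
// Sketch only: the same assembly logic with made-up values.
$port      = "8080";
$host      = "api.example-engine.test";
$getString = "/search.xml?s=0&q=test";

$url = ($port === "443") ? "https://" : "http://";
$url .= $host;
if ($port !== 80 && $port !== 443) {
    $url .= ":" . $port;   // appended for non-standard ports
}
$url .= $getString;
// $url === "http://api.example-engine.test:8080/search.xml?s=0&q=test"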
@@ -150,9 +150,9 @@ abstract class Searchengine
         // Each searcher has one entry in it.
         // So if it's empty, then we have currently no searcher running and
         // of course need to spawn a new one.
-        if (sizeof($searcherData) === 0){
+        if (sizeof($searcherData) === 0) {
             $needSearcher = true;
         } else {
             // There we go:
             // There's at least one Fetcher running for this search engine.
             // Now we have to check if the current count is enough to fetch all the

@@ -160,18 +160,18 @@ abstract class Searchengine
             // Let's hardcode a minimum of 100ms between every search job.
             // First calculate the median of all Times
             $median = 0;
-            foreach ($searcherData as $pid => $data){
+            foreach ($searcherData as $pid => $data) {
                 $data = explode(";", $data);
                 $median += floatval($data[1]);
             }
             $median /= sizeof($searcherData);
-            if ($median < .1){
+            if ($median < .1) {
                 $needSearcher = true;
             }
         }
-        if ($needSearcher && Redis::get($this->name) !== "locked"){
+        if ($needSearcher && Redis::get($this->name) !== "locked") {
             Redis::set($this->name, "locked");
-            $this->dispatch(new Searcher($this->name));
+            $this->dispatch(new Searcher($this->name, $this->user, $this->password));
         }
     }
 }
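Note: this hunk is where the new constructor arguments are wired in: the engine forwards its own $this->user and $this->password to every Searcher it dispatches. The commit does not show where those two properties are populated; presumably they come from the per-engine configuration. A sketch of that assumption, with made-up XML attribute names:

<?php
// Sketch only: how a Searchengine might read optional credentials from its XML definition.
// The attribute names "user" and "password" are assumptions, not taken from this commit.
$engine = simplexml_load_string(
    '<scopia host="api.example-engine.test" port="443" user="example-user" password="example-secret" />'
);

$user     = isset($engine["user"]) ? (string) $engine["user"] : null;
$password = isset($engine["password"]) ? (string) $engine["password"] : null;

// Later, exactly as in the hunk above:
// $this->dispatch(new Searcher($this->name, $user, $password));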
@@ -203,7 +203,8 @@ abstract class Searchengine
         $this->enabled = true;
     }

-    public function setResultHash($hash){
+    public function setResultHash($hash)
+    {
         $this->resultHash = $hash;
     }
app/Models/parserSkripte/Fastbot.php (deleted, file mode 100644 → 0)
<?php

namespace app\Models\parserSkripte;

use App\Models\Searchengine;

class Fastbot extends Searchengine
{
    public $results = [];

    public function __construct(\SimpleXMLElement $engine, \App\MetaGer $metager)
    {
        parent::__construct($engine, $metager);
        if (strpos($this->urlEncode($metager->getEingabe()), "%") !== false) {
            $this->enabled = false;
            return null;
        }
    }

    public function loadResults($result)
    {
        $result = utf8_encode($result);
        $counter = 0;
        foreach (explode("\n", $result) as $line) {
            $line = trim($line);
            if (strlen($line) > 0) {
                # Here we get each individual result ("Hier bekommen wir jedes einzelne Ergebnis")
                $result = explode("|:|", $line);
                if (count($result) < 4) {
                    continue;
                }
                $link = $result[1];
                $link = substr($link, strpos($link, "href=\"") + 6);
                $link = substr($link, 0, strpos($link, "\""));
                $counter++;
                $this->gefVon = "<a href=\"" . $this->homepage . "\" target=\"_blank\" rel=\"noopener\">" . $this->displayName . " " . trans('results.redirect') . "</a>";
                $this->results[] = new \App\Models\Result(
                    $this->engine,
                    trim(strip_tags($result[1])),
                    $link,
                    $result[3],
                    $result[2],
                    $this->displayName,
                    $this->homepage,
                    $counter
                );
            }
        }
    }
}
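Note: loadResults() above expected one result per line with fields separated by "|:|", the second field carrying an HTML anchor. A hypothetical input line and the values the parser would extract; the field order is inferred from how $result[1] to $result[3] are used, not from a feed specification:

<?php
// Sketch only: a made-up Fastbot-style result line run through the same string handling.
$line = '1|:|<a href="https://www.example.org/page">Example page title</a>|:|A short description.|:|www.example.org/page';

$parts = explode("|:|", $line);            // at least 4 fields are required
$link  = $parts[1];
$link  = substr($link, strpos($link, "href=\"") + 6);
$link  = substr($link, 0, strpos($link, "\""));

echo trim(strip_tags($parts[1])) . "\n";   // "Example page title"
echo $link . "\n";                         // "https://www.example.org/page"
echo $parts[2] . "\n";                     // passed as the 5th Result argument (description)
echo $parts[3] . "\n";                     // passed as the 4th Result argument (display link)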
app/Models/parserSkripte/Scopia.php (new file, file mode 0 → 100644)
<?php

namespace app\Models\parserSkripte;

use App\Models\Searchengine;
use Log;

class Scopia extends Searchengine
{
    public $results = [];

    public function __construct(\SimpleXMLElement $engine, \App\MetaGer $metager)
    {
        parent::__construct($engine, $metager);
    }

    public function loadResults($result)
    {
        $result = html_entity_decode($result);
        $result = str_replace("&", "&amp;", $result);
        try {
            $content = simplexml_load_string($result);
            if (!$content) {
                return;
            }
            $results = $content->xpath('//results/result');
            foreach ($results as $result) {
                $title = $result->title->__toString();
                $link = $result->url->__toString();
                $anzeigeLink = $link;
                $descr = $result->description->__toString();
                $this->counter++;
                $this->results[] = new \App\Models\Result(
                    $this->engine,
                    $title,
                    $link,
                    $anzeigeLink,
                    $descr,
                    $this->gefVon,
                    $this->counter
                );
            }
        } catch (\Exception $e) {
            Log::error("A problem occurred parsing results from $this->name:");
            Log::error($e->getMessage());
            return;
        }
    }

    public function getNext(\App\MetaGer $metager, $result)
    {
        $result = html_entity_decode($result);
        $result = str_replace("&", "&amp;", $result);
        try {
            $content = simplexml_load_string($result);
        } catch (\Exception $e) {
            Log::error("A problem occurred parsing results from $this->name:");
            Log::error($e->getMessage());
            return;
        }
        if (!$content) {
            return;
        }

        $more = $content->xpath('//results/more')[0]->__toString() === "1" ? true : false;
        if ($more) {
            $results = $content->xpath('//results/result');
            $number = $results[sizeof($results) - 1]->number->__toString();
            # Create the new search engine object and adjust the GetString
            # ("Erstellen des neuen Suchmaschinenobjekts und anpassen des GetStrings"):
            $next = new Scopia(simplexml_load_string($this->engine), $metager);
            $next->getString = preg_replace("/\\?s=.*?&/si", "?s=" . $number, $next->getString);
            $next->hash = md5($next->host . $next->getString . $next->port . $next->name);
            $this->next = $next;
        }
    }
}
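Note: the XPath expressions imply a response shaped like <results><result><title/><url/><description/><number/></result>...<more/></results>. A minimal sketch that runs a made-up payload through the same parsing steps; the element layout is inferred from the parser, not from any Scopia documentation:

<?php
// Sketch only: feed a hypothetical Scopia-style XML response through the same steps.
$xml = '<results>
    <result>
        <title>Example result</title>
        <url>https://www.example.org/page</url>
        <description>A short description.</description>
        <number>1</number>
    </result>
    <more>1</more>
</results>';

$content = simplexml_load_string($xml);
foreach ($content->xpath('//results/result') as $result) {
    echo $result->title->__toString() . "\n";        // "Example result"
    echo $result->url->__toString() . "\n";          // "https://www.example.org/page"
    echo $result->description->__toString() . "\n";  // "A short description."
}

// Pagination flag and next offset, read the same way getNext() reads them:
$more    = $content->xpath('//results/more')[0]->__toString() === "1";
$results = $content->xpath('//results/result');
$number  = $results[sizeof($results) - 1]->number->__toString();
echo $more ? "more results, next offset ?s=" . $number . "\n" : "last page\n";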