From cebd0214dbc075a8047a139bb2fef23a19a676a2 Mon Sep 17 00:00:00 2001 From: Dominik Pfennig <dominik@suma-ev.de> Date: Mon, 13 Feb 2017 12:55:05 +0100 Subject: [PATCH] Weitere Anpassungen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Die regulären Ausdrücke passen aber leider immer noch nicht komplett. Die Problematik ist schwieriger als gedacht. --- app/Http/Controllers/ProxyController.php | 140 +++++++++++++++++++++-- 1 file changed, 130 insertions(+), 10 deletions(-) diff --git a/app/Http/Controllers/ProxyController.php b/app/Http/Controllers/ProxyController.php index cde1c2c..943c643 100644 --- a/app/Http/Controllers/ProxyController.php +++ b/app/Http/Controllers/ProxyController.php @@ -102,10 +102,12 @@ class ProxyController extends Controller { $targetUrl = str_replace("<<SLASH>>", "/", $url); $targetUrl = str_rot13(base64_decode($targetUrl)); + $this->password = $password; // Hash Value under which a possible cached file would've been stored $hash = md5($targetUrl); $result = []; $httpcode = 200; + if (!Cache::has($hash) || 1 == 1) { // Inits the Curl connection for being able to preload multiple URLs while using a keep-alive connection $this->initCurl(); @@ -126,16 +128,34 @@ class ProxyController extends Controller } extract(parse_url($targetUrl)); - $base = $scheme . "://" . $host; + $this->base = $scheme . "://" . $host; if (isset($path)) { - $base .= $path; + $this->base .= $path; } - - $result["data"] = $this->parseRelativeToAbsolute($result["data"], $base); if (isset($result["header"]["Content-Type"]) && stripos($result["header"]["Content-Type"], "text/html") !== false) { $result["data"] = $this->convertTargetAttributes($result["data"]); } - $result["data"] = $this->parseProxyLink($result["data"], $password, $request); + // We define the "tag" that encloses possible URLS that are needed to be parsed + // Every tag is seperated by a "|" and needs to be regexp escaped + $tagsToMatch = "href=|src=|action=|background="; + // We have to match all Links enclosed within Quotes + $result["data"] = preg_replace_callback("/(<[^>]+)($tagsToMatch)\s*([\"\'])((?!\\\\3).*?)(\\3.*?>)/si", "self::regRel2AbsQuotes", $result["data"]); + // Ommitting Quotes is valid too so we match all Links matching this here + $result["data"] = preg_replace_callback("/(<[^>]+?)($tagsToMatch)([^\"\'\s][^\s\"\/>]*?)(\s[^>]+?>|>)/si", "self::regRel2AbsNoQuotes", $result["data"]); + // srcsets can contain multiple URLs so we handle them here srcset= + $result["data"] = preg_replace_callback("/(<[^>]+)(srcset=)\s*([\"\'])((?!\\\\3).*?)(\\3.*?>)/s", "self::regRel2AbsSrcSet", $result["data"]); + + if (isset($result["header"]["Content-Type"]) && stripos($result["header"]["Content-Type"], "text/css") !== false) { + // You can define resources in your css files that will make the browser load that resources + // We need to Proxify them, too. + // Option one url(...) + $result["data"] = preg_replace_callback("/(url\(\s*)([^\)]*?)(\))/si", "self::regCssRel2Abs", $result["data"]); + } + + // Now we need replace all of the absolute Links + // We have to distinct whether the target of the Link is _blank|_top or not + + #$result["data"] = $this->parseProxyLink($result["data"], $password, $request); curl_close($this->ch); # We are gonna cache all files for 60 Minutes to reduce @@ -204,12 +224,103 @@ class ProxyController extends Controller return $string; } - private function parseRelativeToAbsolute($data, $base) + private function regRel2AbsNoQuotes($match){ + $top = false; + if(preg_match("/target=[\"\']{0,1}\s*(_blank|_top)/si", $match[0]) === 1){ + $top = true; + } + + $pre = $match[1] . $match[2]; + $post = $match[4]; + $link = $match[3]; + + $link = $this->parseRelativeToAbsolute($link); + + // We will Proxify this URL + $link = $this->proxifyUrl($link, $this->password, $top); + return $pre . $link . $post; + } + + private function regRel2AbsQuotes($match){ + $top = false; + if(preg_match("/target=[\"\']{0,1}\s*(_blank|_top)/si", $match[0]) === 1){ + $top = true; + } + + $pre = $match[1] . $match[2] . $match[3]; + $post = $match[5]; + $link = htmlspecialchars_decode($match[4]); + + $link = $this->parseRelativeToAbsolute($link); + + // We will Proxify this URL + $link = $this->proxifyUrl($link, $this->password, $top); + return $pre . $link . $post; + } + + private function regCssRel2Abs($match){ + $top = false; + $pre = $match[1]; + $post = $match[3]; + $link = htmlspecialchars_decode($match[2]); + + $link = $this->parseRelativeToAbsolute($link); + if(strpos($link, "data:") !== 0){ + # die($link); + } + // We will Proxify this URL + $link = $this->proxifyUrl($link, $this->password, $top); + + return $pre . $link . $post; + } + + private function regRel2AbsSrcSet($match){ + $top = false; + if(preg_match("/target=[\"\']{0,1}\s*(_blank|_top)/si", $match[0]) === 1){ + $top = true; + } + + $pre = $match[1] . $match[2] . $match[3]; + $post = $match[5]; + + $links = explode(",", $match[4]); + $result = $match[4]; + foreach($links as $link){ + preg_match_all("/[\S]+/", $link, $matches); + if(isset($matches[0]) && isset($matches[0][0])){ + // In the srcset the Link would be the first match + $rel = $matches[0][0]; + $absLink = $this->parseRelativeToAbsolute($rel, "", $rel, ""); + + // We will Proxify this URL + $absLink = $this->proxifyUrl($absLink, $this->password, $top); + + $result = str_replace($rel, $absLink, $result); + } + } + return $pre . $result . $post; + } + + private function parseRelativeToAbsolute($link) { - $result = $data; - $count = 1; + // When the Link is already absolute then do not convert anything + if(preg_match("/(?:(?!:\/\/).)+?:\/\//si", $link) === 1){ + return $link; + // If the Link starts with "//" that means it already is an absolute Link + // But it needs to get the current protocol added + }elseif (preg_match("/^\s*?\/\//si", $link) === 1) { + $scheme = parse_url($this->base)["scheme"] . "://"; + $abs = preg_replace("/^\s*?\/\//si", "$scheme", $link); + return $abs; + // The Link that is following here is not absolute. But it can be invalid: + }else{ + $absLink = $this->rel2abs($link, $this->base); + return $absLink; + } - # Convert every Link that starts with [ . | / ] but not with [ // ] + # Convert every Link that starts with [ . | / ] but not with [ // ] + + while (preg_match("/(href=|src=|url\(|action=|srcset=|@import |background=)(\s*[\"\']{0,1}\s*)((:?\.|\/[^\/])[^\"\'\s]+)([\"\'\s])/si", $result, $matches) === 1) { $absoluteLink = $this->rel2abs($matches[3], $base); $result = str_replace($matches[0], $matches[1] . $matches[2] . $absoluteLink . $matches[5], $result, $count); @@ -281,6 +392,8 @@ class ProxyController extends Controller { $this->ch = curl_init(); curl_setopt($this->ch, CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)'); + curl_setopt($this->ch, CURLOPT_SSL_VERIFYHOST, 0); + curl_setopt($this->ch, CURLOPT_SSL_VERIFYPEER, 0); curl_setopt($this->ch, CURLOPT_RETURNTRANSFER, 1); curl_setopt($this->ch, CURLOPT_CONNECTTIMEOUT, 5); curl_setopt($this->ch, CURLOPT_TIMEOUT, 5); @@ -289,10 +402,10 @@ class ProxyController extends Controller private function getUrlContent($url, $withCookies) { + $url = htmlspecialchars_decode($url); curl_setopt($this->ch, CURLOPT_URL, "$url"); $data = curl_exec($this->ch); - die(htmlspecialchars_decode($url)); $httpcode = intval(curl_getinfo($this->ch, CURLINFO_HTTP_CODE)); $header_size = curl_getinfo($this->ch, CURLINFO_HEADER_SIZE); @@ -342,6 +455,13 @@ class ProxyController extends Controller public function proxifyUrl($url, $password = null, $topLevel) { + // Only convert valid URLs + $url = trim($url); + if(strpos($url, "http") !== 0){ + return $url; + } + + if (!$password) { $password = urlencode(\Request::route('password')); } -- GitLab