From 671a7e8f2c28e5b851f50ac0dca4de69fe4e1aa1 Mon Sep 17 00:00:00 2001
From: Dominik Pfennig <dominik@suma-ev.de>
Date: Wed, 25 Apr 2018 13:43:43 +0200
Subject: [PATCH] Das Encoding wird nun automatisch entweder aus dem Response Header, oder aus den Meta Tags der Seite ausgelesen. Fallback ist "UTF-8" das sollte viele Umlautprobleme beheben.

---
Review notes (ignored by git-am, not part of the commit message):
- ProxyController no longer resets $contentEncoding to null before building the
  HtmlDocument, so a charset named in the response header is actually used.
- The charset is extracted with a regular expression in both files, so quoted
  values (charset="utf-8") and trailing parameters are no longer passed on.
- HtmlDocument::__construct() keeps $encoding optional (default null = detect),
  so existing three-argument callers keep working.
- Charset labels unknown to mbstring are still handed to mb_convert_encoding()
  unchecked.

 app/HtmlDocument.php                     | 39 ++++++++++++++++++++++--
 app/Http/Controllers/ProxyController.php |  7 ++++--
 2 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/app/HtmlDocument.php b/app/HtmlDocument.php
index 09dc186..c880572 100644
--- a/app/HtmlDocument.php
+++ b/app/HtmlDocument.php
@@ -9,10 +9,45 @@ class HtmlDocument extends Document
 
     private $htmlString;
 
-    public function __construct($password, $baseUrl, $htmlString, $encoding = "UTF-8")
+    public function __construct($password, $baseUrl, $htmlString, $encoding = null)
     {
         parent::__construct($password, $baseUrl);
-        $this->htmlString = mb_convert_encoding($htmlString, 'HTML-ENTITIES', $encoding);
+        $this->htmlString = $this->convertEncoding($htmlString, $encoding);
+    }
+
+    /**
+     * Convert the markup to HTML entities, looking up the source charset when none is given.
+     *
+     * @param string      $htmlString HTML markup to convert.
+     * @param string|null $encoding   Source charset; when null the <meta> tags are searched and "UTF-8" is the fallback.
+     * @return string|false Result of mb_convert_encoding().
+     */
+    public function convertEncoding($htmlString, $encoding){
+        # If the site sets the document encoding in the "content-type" Header the $encoding variable is not null
+        if($encoding == null){
+            # Otherwise we will try to extract the correct encoding from the meta tag
+            # Let's create a new DOM
+            libxml_use_internal_errors(true);
+            $dom = new DomDocument();
+            $dom->loadHtml($htmlString);
+
+            foreach($dom->getElementsByTagName("meta") as $meta){
+                # If there is a Content-Type Meta Tag
+                if($meta->hasAttribute("http-equiv") && strtolower($meta->getAttribute("http-equiv")) === "content-type" && $meta->hasAttribute("content")){
+                    $contentType = $meta->getAttribute("content");
+                    # Take only the charset token: no quotes and no parameters that follow it
+                    $encoding = preg_match('/charset\s*=\s*["\']?([^\s;"\']+)/i', $contentType, $match) === 1 ? $match[1] : null;
+                    if($encoding !== null) break;
+                }
+                # If there is a Charset Meta Tag
+                if($meta->hasAttribute("charset")){
+                    $encoding = $meta->getAttribute("charset");
+                    break;
+                }
+            }
+            if($encoding === null) $encoding = "UTF-8"; # Default Fallback
+        }
+        return mb_convert_encoding($htmlString, 'HTML-ENTITIES', $encoding);
     }
 
     public function getResult()
diff --git a/app/Http/Controllers/ProxyController.php b/app/Http/Controllers/ProxyController.php
index 0803fd3..f3faf35 100644
--- a/app/Http/Controllers/ProxyController.php
+++ b/app/Http/Controllers/ProxyController.php
@@ -131,7 +131,7 @@ class ProxyController extends Controller
         $supportedContentTypes = [
             'text/html',
         ];
-        
+
         $targetUrl = str_replace("<<SLASH>>", "/", $url);
         $targetUrl = str_rot13(base64_decode($targetUrl));
         $this->password = $password;
@@ -159,11 +159,14 @@ class ProxyController extends Controller
 
         # We will parse whether we have a parser for this document type.
         # If not, we will not Proxy it:
+        $contentTypeHeader = $result["header"]["content-type"];
         $contentType = strpos($result["header"]["content-type"], ";") !== false ? trim(substr($result["header"]["content-type"], 0, strpos($result["header"]["content-type"], ";"))) : trim($result["header"]["content-type"]);
+        # Charset named by the header (quotes and trailing parameters stripped), null if it names none
+        $contentEncoding = preg_match('/charset\s*=\s*["\']?([^\s;"\']+)/i', $contentTypeHeader, $charsetMatch) === 1 ? $charsetMatch[1] : null;
         switch ($contentType) {
             case 'text/html':
                 # It's a html Document
-                $htmlDocument = new HtmlDocument($password, $targetUrl, $result["data"]);
+                $htmlDocument = new HtmlDocument($password, $targetUrl, $result["data"], $contentEncoding);
                 $htmlDocument->proxifyContent();
                 $result["data"] = $htmlDocument->getResult();
                 break;
-- 
GitLab