From 671a7e8f2c28e5b851f50ac0dca4de69fe4e1aa1 Mon Sep 17 00:00:00 2001
From: Dominik Pfennig <dominik@suma-ev.de>
Date: Wed, 25 Apr 2018 13:43:43 +0200
Subject: [PATCH] Das Encoding wird nun automatisch entweder aus dem Response
 Header oder aus den Meta Tags der Seite ausgelesen. Fallback ist "UTF-8"; das
 sollte viele Umlautprobleme beheben.

---
 app/HtmlDocument.php                     | 31 ++++++++++++++++++++++--
 app/Http/Controllers/ProxyController.php |  7 ++++--
 2 files changed, 34 insertions(+), 4 deletions(-)

diff --git a/app/HtmlDocument.php b/app/HtmlDocument.php
index 09dc186..c880572 100644
--- a/app/HtmlDocument.php
+++ b/app/HtmlDocument.php
@@ -9,10 +9,37 @@ class HtmlDocument extends Document
 
     private $htmlString;
 
-    public function __construct($password, $baseUrl, $htmlString, $encoding = "UTF-8")
+    public function __construct($password, $baseUrl, $htmlString, $encoding = null)
     {
         parent::__construct($password, $baseUrl);
-        $this->htmlString = mb_convert_encoding($htmlString, 'HTML-ENTITIES', $encoding);
+        $this->htmlString = $this->convertEncoding($htmlString, $encoding); # null or omitted: the charset is looked up in the document's meta tags
+    }
+
+    # Returns $htmlString converted to HTML entities. $encoding is the charset announced by the
+    # Content-Type header; if it is empty, the document's <meta> tags are searched instead. Names
+    # unknown to mbstring, and documents that declare no charset at all, are treated as UTF-8.
+    public function convertEncoding($htmlString, $encoding){
+        if($encoding == null && (string) $htmlString !== ""){ # loadHTML() refuses an empty string
+            $previousErrorMode = libxml_use_internal_errors(true); # silence libxml warnings about malformed markup
+            $dom = new \DOMDocument();
+            $dom->loadHTML($htmlString);
+            libxml_clear_errors();
+            libxml_use_internal_errors($previousErrorMode);
+            foreach($dom->getElementsByTagName("meta") as $meta){
+                # Either <meta http-equiv="Content-Type" content="...; charset=..."> or <meta charset="...">
+                $isContentType = strtolower($meta->getAttribute("http-equiv")) === "content-type";
+                if($isContentType && preg_match('/charset\s*=\s*["\']?([^"\';\s]+)/i', $meta->getAttribute("content"), $charsetMatch) === 1){
+                    $encoding = $charsetMatch[1];
+                }elseif($meta->hasAttribute("charset")){
+                    $encoding = $meta->getAttribute("charset");
+                }
+                if($encoding != null) break;
+            }
+        }
+        $encoding = trim((string) $encoding, " \t\r\n\"'"); # header values may arrive quoted
+        $knownEncodings = array_map('strtolower', array_merge(mb_list_encodings(), ...array_map('mb_encoding_aliases', mb_list_encodings())));
+        if(!in_array(strtolower($encoding), $knownEncodings, true)) $encoding = "UTF-8"; # the name is remote input: never pass mbstring one it rejects
+        return mb_convert_encoding($htmlString, 'HTML-ENTITIES', $encoding);
     }
 
     public function getResult()
diff --git a/app/Http/Controllers/ProxyController.php b/app/Http/Controllers/ProxyController.php
index 0803fd3..f3faf35 100644
--- a/app/Http/Controllers/ProxyController.php
+++ b/app/Http/Controllers/ProxyController.php
@@ -131,7 +131,7 @@ class ProxyController extends Controller
         $supportedContentTypes = [
             'text/html',
         ];
-
+
         $targetUrl      = str_replace("<<SLASH>>", "/", $url);
         $targetUrl      = str_rot13(base64_decode($targetUrl));
         $this->password = $password;
@@ -159,11 +159,14 @@ class ProxyController extends Controller
 
                 # We will parse whether we have a parser for this document type.
                 # If not, we will not Proxy it:
+                $contentTypeHeader = $result["header"]["content-type"];
                 $contentType = strpos($result["header"]["content-type"], ";") !== false ? trim(substr($result["header"]["content-type"], 0, strpos($result["header"]["content-type"], ";"))) : trim($result["header"]["content-type"]);
+                # Charset named by the header, without quotes or trailing parameters; null when the header names none, so that HtmlDocument falls back to the document's meta tags
+                $contentEncoding = preg_match('/charset\s*=\s*["\']?([^"\';\s]+)/i', $contentTypeHeader, $charsetMatch) === 1 ? $charsetMatch[1] : null;
                 switch ($contentType) {
                     case 'text/html':
                         # It's a html Document
-                        $htmlDocument = new HtmlDocument($password, $targetUrl, $result["data"]);
+                        $htmlDocument = new HtmlDocument($password, $targetUrl, $result["data"], $contentEncoding);
                         $htmlDocument->proxifyContent();
                         $result["data"] = $htmlDocument->getResult();
                         break;
-- 
GitLab