<?php

namespace App;

use DomDocument;

/**
 * An HTML document whose links and resource URLs are rewritten so that they
 * point at this proxy server.
 *
 * The helpers convertRelativeToAbsoluteLink(), proxifyUrl() and
 * proxifyFormAction() as well as the $password and $baseUrl properties are
 * not defined in this class; they are expected to come from the parent
 * Document class.
 */
class HtmlDocument extends Document
{
    /** @var string The HTML, converted to HTML entities and (after proxifyContent()) proxified. */
    private $htmlString;

    /** @var string|null The encoding that was used to convert the document, if one was determined. */
    private $encoding;

    /**
     * @param mixed       $password   Passed through to the parent constructor and to nested documents.
     * @param string      $baseUrl    Passed through to the parent constructor; used as base for relative links.
     * @param string      $htmlString The raw HTML of the document.
     * @param string|null $encoding   Encoding announced by the "content-type" header, or null if unknown.
     */
    public function __construct($password, $baseUrl, $htmlString, $encoding = null)
    {
        parent::__construct($password, $baseUrl);
        $this->htmlString = $this->convertEncoding($htmlString, $encoding);
    }

    /**
     * Converts the given HTML to HTML entities, detecting the source encoding if necessary.
     *
     * If no encoding is supplied, the meta tags of the document are searched for one;
     * if none is found there either, "UTF-8" is assumed.
     *
     * @param string      $htmlString The raw HTML.
     * @param string|null $encoding   The known encoding or null/"" to auto-detect.
     *
     * @return string The result of mb_convert_encoding() for the document.
     */
    public function convertEncoding($htmlString, $encoding)
    {
        # If the site sets the document encoding in the "content-type" header, $encoding is not null.
        # The loose comparison is intentional: null and "" both mean "no encoding supplied".
        if ($encoding == null && !empty($htmlString)) {
            # Otherwise we try to extract the correct encoding from the meta tags
            $encoding = $this->detectMetaCharset($htmlString);
            if ($encoding === null) {
                # Default fallback
                $encoding = "UTF-8";
            }
        }

        # NOTE(review): the 'HTML-ENTITIES' target of mb_convert_encoding() is deprecated
        # as of PHP 8.2 - confirm which PHP versions this file has to support.
        if (!empty($encoding)) {
            $this->encoding = $encoding;
            return mb_convert_encoding($htmlString, 'HTML-ENTITIES', $encoding);
        }

        return mb_convert_encoding($htmlString, 'HTML-ENTITIES');
    }

    /**
     * @return string The current HTML of the document.
     */
    public function getResult()
    {
        return $this->htmlString;
    }

    /**
     * @return string|null The encoding stored by convertEncoding(), or null if none was stored.
     */
    public function getEncoding()
    {
        return $this->encoding;
    }

    /**
     * Parses the document and proxifies all links/URLs in it so that they target this proxy server.
     *
     * Besides rewriting URLs this empties and removes script tags and removes
     * all attributes whose name starts with "on". The result replaces the stored HTML
     * and can be read with getResult(). Documents that are empty after trimming are left untouched.
     *
     * @return void
     */
    public function proxifyContent()
    {
        if (trim($this->htmlString) === "") {
            return;
        }

        # Let's create a new DOM. Parser warnings are collected internally instead of being
        # emitted; the previous setting is remembered so it can be restored at the end.
        $previousErrorMode = libxml_use_internal_errors(true);
        $dom = new DomDocument();
        $dom->loadHtml($this->htmlString);

        # Base tags: the node list is live, so we iterate over a snapshot because
        # removing nodes while iterating the live list would skip elements.
        $baseApplied = false;
        foreach (iterator_to_array($dom->getElementsByTagName('base')) as $base) {
            # Only the first base tag with a href defines the base URL
            if (!$baseApplied && $base->hasAttribute("href")) {
                # Convert a relative base to an absolute one
                $this->baseUrl = $this->convertRelativeToAbsoluteLink($base->getAttribute('href'));
                $baseApplied = true;
            }
            # Delete the base tag
            $base->parentNode->removeChild($base);
        }

        # First things first. Let's change all a tags
        foreach ($dom->getElementsByTagName('a') as $link) {
            if (!$link->hasAttribute("href")) {
                continue;
            }

            $href = $link->getAttribute("href");
            # Leading whitespace must not hide a fragment or a javascript: link
            $probe = trim($href);

            if (stripos($probe, "#") === 0) {
                # Pure fragment links stay as they are
                continue;
            }

            if (stripos($probe, "javascript:") === 0) {
                $link->setAttribute("href", "");
                continue;
            }

            # All links within an "a" tag need to target the top level because they change the site on click
            $this->convertTargetAttribute($link, "_top");
            # Relative to absolute, then absolute to proxified
            $absolute = $this->convertRelativeToAbsoluteLink($href);
            $link->setAttribute("href", $this->proxifyUrl($absolute, true));
        }

        # All buttons
        foreach ($dom->getElementsByTagName('button') as $button) {
            if ($button->hasAttribute("formtarget")) {
                $button->setAttribute("formtarget", "_top");
            }
            $this->rewriteUrlAttribute($button, "formaction", true);
            # Since when are buttons allowed to have a href?
            # Youtube has such on its site so we are converting it anyway
            $this->rewriteUrlAttribute($button, "href", true);
        }

        # All area tags: they navigate the page on click, so they target the top level
        foreach ($dom->getElementsByTagName('area') as $area) {
            $this->convertTargetAttribute($area, "_top");
            $this->rewriteUrlAttribute($area, "href", true);
        }

        # All forms: submitting changes the page, so they target the top level
        foreach ($dom->getElementsByTagName('form') as $form) {
            $this->convertTargetAttribute($form, "_top");
            $action = $form->getAttribute("action");
            if ($action === "") {
                # A form without an action references the current document
                $action = $this->baseUrl;
            } else {
                # Otherwise the link could be relative and needs to be made absolute
                $action = $this->convertRelativeToAbsoluteLink($action);
            }
            # And finally proxify the URL
            $form->setAttribute("action", $this->proxifyFormAction($action));
        }

        # All link tags
        foreach ($dom->getElementsByTagName('link') as $link) {
            # These links should NOT target the top level
            $this->rewriteUrlAttribute($link, "href", false);
            # We proxify any file that is linked here, which changes its content.
            # An integrity check would therefore fail, so the attribute is removed.
            $link->removeAttribute("integrity");
        }

        # All iframes
        foreach ($dom->getElementsByTagName('iframe') as $iframe) {
            $this->rewriteFrameSources($iframe);
        }

        # All frames
        foreach ($dom->getElementsByTagName('frame') as $frame) {
            $this->rewriteFrameSources($frame);
        }

        # All image tags
        foreach ($dom->getElementsByTagName('img') as $img) {
            $this->rewriteUrlAttribute($img, "src", false);
            $this->rewriteSrcset($img);
        }

        # All input elements
        foreach ($dom->getElementsByTagName('input') as $input) {
            $this->rewriteUrlAttribute($input, "src", false);
        }

        # All source tags
        foreach ($dom->getElementsByTagName('source') as $source) {
            $this->rewriteUrlAttribute($source, "src", false);
            $this->rewriteSrcset($source);
        }

        # All meta tags
        foreach ($dom->getElementsByTagName('meta') as $meta) {
            $this->rewriteUrlAttribute($meta, "href", false);

            # http-equiv values are matched case-insensitively ("Refresh" is common)
            if ($meta->hasAttribute("http-equiv")
                && strtolower(trim($meta->getAttribute("http-equiv"))) === "refresh"
            ) {
                # The site refreshes/redirects via a meta tag, but not before we proxified the new URL
                $content = $meta->getAttribute("content");
                $urlPosition = stripos($content, "url=");
                # Without "url=" the tag only reloads the current page; nothing to rewrite then
                if ($urlPosition !== false) {
                    $prefix = substr($content, 0, $urlPosition + 4);
                    # The URL may be surrounded by whitespace or quotes
                    $url = trim(substr($content, $urlPosition + 4), " \t\n\r\0\x0B\"'");
                    $url = $this->convertRelativeToAbsoluteLink($url);
                    $url = $this->proxifyUrl($url, false);
                    $meta->setAttribute("content", $prefix . $url);
                }
            }
        }

        # All script tags are emptied here; the remaining empty tags are removed after serialization
        foreach ($dom->getElementsByTagName('script') as $script) {
            $script->nodeValue = "";
            $script->setAttribute("src", "");
            $script->setAttribute("type", "");
        }

        # All style blocks are parsed separately as CSS documents
        foreach ($dom->getElementsByTagName('style') as $style) {
            $cssElement = new CssDocument($this->password, $this->baseUrl, $style->nodeValue);
            $cssElement->proxifyContent();
            $style->nodeValue = $cssElement->getResult();
        }

        # Now all video tags
        foreach ($dom->getElementsByTagName("video") as $video) {
            $this->rewriteUrlAttribute($video, "src", false);
            $this->rewriteUrlAttribute($video, "poster", false);
        }

        # Finally we walk over all tags once more
        foreach ($dom->getElementsByTagName('*') as $el) {
            # Inline styles are handled like style blocks
            $styleString = $el->getAttribute("style");
            if ($styleString !== "") {
                $cssElement = new CssDocument($this->password, $this->baseUrl, $styleString);
                $cssElement->proxifyContent();
                $el->setAttribute("style", $cssElement->getResult());
            }

            # Some old sites might use the background attribute. Let's parse them, too
            $this->rewriteUrlAttribute($el, "background", false);

            # We remove all JavaScript event attributes. To keep things simple we remove
            # every attribute whose name starts with "on". The names are collected first
            # because the attribute map is live: removing entries while iterating it
            # would skip the attribute following each removed one.
            $eventAttributes = [];
            foreach ($el->attributes as $attr) {
                if (stripos($attr->name, "on") === 0) {
                    $eventAttributes[] = $attr->name;
                }
            }
            foreach ($eventAttributes as $attributeName) {
                $el->removeAttribute($attributeName);
            }
        }

        $this->htmlString = $dom->saveHtml();

        # Remove all now empty script tags
        $this->htmlString = preg_replace("/<\s*[\/]{0,1}\s*script[^>]*?>/si", "", $this->htmlString);

        # Drop the parser messages we caused (only if nobody else was collecting them)
        # and restore the error handling mode that was active before this call
        if (!$previousErrorMode) {
            libxml_clear_errors();
        }
        libxml_use_internal_errors($previousErrorMode);
    }

    /**
     * Searches the meta tags of the given HTML for a charset declaration.
     *
     * Both forms are recognised: a "content-type" http-equiv tag whose content
     * contains "charset=..." and a meta tag with a "charset" attribute. The first
     * non-empty declaration in document order wins.
     *
     * @param string $htmlString The raw HTML.
     *
     * @return string|null The declared charset or null if the document declares none.
     */
    private function detectMetaCharset($htmlString)
    {
        $previousErrorMode = libxml_use_internal_errors(true);
        $dom = new DomDocument();
        $dom->loadHtml($htmlString);

        $charset = null;
        foreach ($dom->getElementsByTagName("meta") as $meta) {
            # Content-Type meta tag, e.g. content="text/html; charset=utf-8".
            # A tag without a charset must not end the search.
            if (strtolower($meta->getAttribute("http-equiv")) === "content-type"
                && preg_match('/charset\s*=\s*["\']?([^\s;"\']+)/i', $meta->getAttribute("content"), $match)
            ) {
                $charset = $match[1];
                break;
            }

            # Charset meta tag, e.g. <meta charset="utf-8">
            $candidate = trim($meta->getAttribute("charset"));
            if ($candidate !== "") {
                $charset = $candidate;
                break;
            }
        }

        if (!$previousErrorMode) {
            libxml_clear_errors();
        }
        libxml_use_internal_errors($previousErrorMode);

        return $charset;
    }

    /**
     * Makes the URL in the given attribute absolute and proxifies it.
     * Elements that do not carry the attribute are left untouched.
     *
     * @param \DOMElement $element   The element to modify.
     * @param string      $attribute Name of the attribute holding the URL.
     * @param bool        $topLevel  Passed as second argument to proxifyUrl().
     *
     * @return void
     */
    private function rewriteUrlAttribute($element, $attribute, $topLevel)
    {
        if (!$element->hasAttribute($attribute)) {
            return;
        }

        $absolute = $this->convertRelativeToAbsoluteLink($element->getAttribute($attribute));
        $element->setAttribute($attribute, $this->proxifyUrl($absolute, $topLevel));
    }

    /**
     * Proxifies every image URL of a srcset attribute.
     * Syntax would be i.e. srcset="medium.jpg 1000w, large.jpg 2000w".
     *
     * @param \DOMElement $element The element whose srcset attribute is rewritten.
     *
     * @return void
     */
    private function rewriteSrcset($element)
    {
        $srcset = $element->getAttribute("srcset");
        if ($srcset === "") {
            return;
        }

        $rewritten = [];
        foreach (explode(",", $srcset) as $candidate) {
            $candidate = trim($candidate);
            # Skip empty entries, e.g. caused by a trailing comma
            if ($candidate === "") {
                continue;
            }
            $parts = preg_split("/\s+/", $candidate);
            # $parts[0] is the image path; it could be relative, so convert and then proxify it
            $parts[0] = $this->proxifyUrl($this->convertRelativeToAbsoluteLink($parts[0]), false);
            $rewritten[] = implode(" ", $parts);
        }

        $element->setAttribute("srcset", implode(",", $rewritten));
    }

    /**
     * Proxifies the two possible sources of a frame or iframe:
     * A - the src attribute defines a URL that the frame loads,
     * B - the srcdoc attribute defines HTML code that is displayed in the frame.
     *
     * @param \DOMElement $frame The frame or iframe element.
     *
     * @return void
     */
    private function rewriteFrameSources($frame)
    {
        # A - only non-empty src values are rewritten
        if ($frame->getAttribute("src") !== "") {
            $this->rewriteUrlAttribute($frame, "src", false);
        }

        # B - the srcdoc is HTML itself, so it is proxified as a document of its own
        $srcdoc = $frame->getAttribute("srcdoc");
        if ($srcdoc !== "") {
            $htmlDoc = new HtmlDocument($this->password, $this->baseUrl, $srcdoc);
            $htmlDoc->proxifyContent();
            $frame->setAttribute("srcdoc", $htmlDoc->getResult());
        }
    }

    /**
     * Changes the current target attribute on the given element to the given new target.
     *
     * @param \DOMElement $link      The element to modify.
     * @param string      $newTarget The new value of the target attribute.
     *
     * @return void
     */
    private function convertTargetAttribute($link, $newTarget)
    {
        $link->setAttribute("target", $newTarget);
    }
}