Skip to content
Snippets Groups Projects
HtmlDocument.php 17.54 KiB
<?php

namespace App;

use DomDocument;

class HtmlDocument extends Document
{

    private $htmlString;

    public function __construct($password, $baseUrl, $htmlString, $encoding)
    {
        parent::__construct($password, $baseUrl);
        $this->htmlString = $this->convertEncoding($htmlString, $encoding);
    }

    public function convertEncoding($htmlString, $encoding)
    {
        # If the site sets the document encoding in the "content-type" Header the $encoding variable is not null
        if ($encoding == null && !empty($htmlString)) {
            # Otherwise we will try to extract the correct encoding from the meta tag
            # Let's create a new DOM
            libxml_use_internal_errors(true);
            $dom = new DomDocument();
            $dom->loadHtml($htmlString);

            foreach ($dom->getElementsByTagName("meta") as $meta) {
                # If there is a Content-Type Meta Tag
                if ($meta->hasAttribute("http-equiv") && strtolower($meta->getAttribute("http-equiv")) === "content-type" && $meta->hasAttribute("content")) {
                    $contentType = $meta->getAttribute("content");
                    $encoding = stripos($contentType, "charset=") !== false ? trim(substr($contentType, stripos($contentType, "charset=") + 8)) : null;
                    $contentEncoding = rtrim($contentEncoding, ";");
                    if ($encoding !== null) {
                        break;
                    }

                }
                # If there is a Charset Meta Tag
                if ($meta->hasAttribute("charset")) {
                    $encoding = $meta->getAttribute("charset");
                    break;
                }
            }
            if ($encoding === null) {
                $encoding = "UTF-8";
            }
            # Default Fallback
        }
        if (!empty($encoding)) {
            return mb_convert_encoding($htmlString, 'HTML-ENTITIES', $encoding);
        } else {
            return mb_convert_encoding($htmlString, 'HTML-ENTITIES');
        }
    }

    public function getResult()
    {
        return $this->htmlString;
    }

    /**
     * Function proxifyContent
     * This method parses the given String and Proxifies all Links/Urls in it so it's targetting this Proxy Server
     **/
    public function proxifyContent()
    {
        if (trim($this->htmlString) === "") {
            return;
        }

        # Let's create a new DOM
        libxml_use_internal_errors(true);
        $dom = new DomDocument();
        $dom->loadHtml($this->htmlString);

        foreach ($dom->getElementsByTagName('base') as $base) {
            if ($base->hasAttribute("href")) {
                $href = $base->getAttribute('href');
                # Convert all relative Links to absolute Ones
                $href = $this->convertRelativeToAbsoluteLink($href);
                $this->baseUrl = $href;
                # Delete Base Tag
            }
            $base->parentNode->removeChild($base);
        }

        # First things first. Let's change all a Tags that can define a target Attribute
        foreach ($dom->getElementsByTagName('a') as $link) {
            if ($link->hasAttribute("href")) {
                if (stripos($link->getAttribute("href"), "#") === 0) {
                    continue;
                } elseif (stripos($link->getAttribute("href"), "javascript:") === 0) {
                    $link->setAttribute("href", "");
                } else {
                    # All Links within a "a" Tag need to target the top level because they change the site on click
                    $this->convertTargetAttribute($link, "_top");
                    # Convert all relative Links to absolute Ones
                    $link->setAttribute("href", $this->convertRelativeToAbsoluteLink($link->getAttribute("href")));
                    # Convert all Links to the proxified Version
                    # All of this Links should target to the top Level
                    $link->setAttribute("href", $this->proxifyUrl($link->getAttribute("href"), true));
                }
            }

        }

        # All Buttons
        foreach ($dom->getElementsByTagName('button') as $button) {
            if ($button->hasAttribute("formtarget")) {
                $button->setAttribute("formtarget", "_top");
            }
            if ($button->hasAttribute("formaction")) {
                $formaction = $button->getAttribute("formaction");
                # Rel to abs
                $formaction = $this->convertRelativeToAbsoluteLink($formaction);
                # Abs to proxified
                $formaction = $this->proxifyUrl($formaction, true);
                # And replace
                $button->setAttribute("formaction", $formaction);
            }
            # Since when are buttons allowed to have a href?
            # Youtube has such on it's site so we are converting it anyways
            if ($button->hasAttribute("href")) {
                $href = $button->getAttribute("href");
                # Rel to abs
                $href = $this->convertRelativeToAbsoluteLink($href);
                # Abs to proxified
                $href = $this->proxifyUrl($href, true);
                # And replace
                $button->setAttribute("href", $href);
            }

        }

        foreach ($dom->getElementsByTagName('area') as $area) {
            # All Links within a "a" Tag need to target the top level because they change the site on click
            $this->convertTargetAttribute($area, "_top");
            if ($area->hasAttribute("href")) {
                $href = $area->getAttribute("href");
                # Rel to abs
                $href = $this->convertRelativeToAbsoluteLink($href);
                # Abs to proxified
                $href = $this->proxifyUrl($href, true);
                # And replace
                $area->setAttribute("href", $href);
            }
        }

        foreach ($dom->getElementsByTagName('form') as $form) {
            # All Links within a "a" Tag need to target the top level because they change the site on click
            $this->convertTargetAttribute($form, "_top");
            # If a Form doesn't define a action It references itself but we need to set the link then
            $action = $form->getAttribute("action");
            if ($action === "") {
                $action = $this->baseUrl;
            } else {
                # Otherwise the Link could be relative and we need to change it:
                # Convert all relative Links to absolute Ones
                $action = $this->convertRelativeToAbsoluteLink($action);
            }
            #
            # And finally Proxify the Url
            $action = $this->proxifyUrl($action, true);
            $form->setAttribute("action", $action);
        }

        # Alle Link Tags
        foreach ($dom->getElementsByTagName('link') as $link) {
            # Convert all relative Links to absolute Ones
            $link->setAttribute("href", $this->convertRelativeToAbsoluteLink($link->getAttribute("href")));
            # Convert all Links to the proxified Version
            # All of this Links should NOT target to the top Level
            $link->setAttribute("href", $this->proxifyUrl($link->getAttribute("href"), false));
        }

        # All Iframes
        foreach ($dom->getElementsByTagName('iframe') as $iframe) {
            # There can be 2 Possible sources
            # A - The src Attribute defines a Url that the Iframe loads
            $src = $iframe->getAttribute("src");
            if ($src !== "") {
                # Make the Link absolute
                $src = $this->convertRelativeToAbsoluteLink($src);
                # Proxify the Link
                $src = $this->proxifyUrl($src, false);
                # Replace the old Link
                $iframe->setAttribute("src", $src);
            }
            # B - The srcdoc Attribute defines Html-Code that should be displayed in the frame
            $srcdoc = $iframe->getAttribute("srcdoc");
            if ($srcdoc !== "") {
                # The srcdoc should be a HTML String so we are gonna make a new HTML-Document Element
                $htmlDoc = new HtmlDocument($this->password, $this->baseUrl, $srcdoc);
                $htmlDoc->proxifyContent();
                $srcdoc = $htmlDoc->getResult();
                # Replace the Old HTML Code
                $iframe->setAttribute("srcdoc", $srcdoc);
            }
        }

        # All Frames
        foreach ($dom->getElementsByTagName('frame') as $frame) {
            # There can be 2 Possible sources
            # A - The src Attribute defines a Url that the Iframe loads
            $src = $frame->getAttribute("src");
            if ($src !== "") {
                # Make the Link absolute
                $src = $this->convertRelativeToAbsoluteLink($src);
                # Proxify the Link
                $src = $this->proxifyUrl($src, false);
                # Replace the old Link
                $frame->setAttribute("src", $src);
            }
            # B - The srcdoc Attribute defines Html-Code that should be displayed in the frame
            $srcdoc = $frame->getAttribute("srcdoc");
            if ($srcdoc !== "") {
                # The srcdoc should be a HTML String so we are gonna make a new HTML-Document Element
                $htmlDoc = new HtmlDocument($this->password, $this->baseUrl, $srcdoc);
                $htmlDoc->proxifyContent();
                $srcdoc = $htmlDoc->getResult();
                # Replace the Old HTML Code
                $iframe->setAttribute("srcdoc", $srcdoc);
            }
        }

        # All Image Tags
        foreach ($dom->getElementsByTagName('img') as $img) {
            # Convert all Image src's to Absolute Links
            $img->setAttribute("src", $this->convertRelativeToAbsoluteLink($img->getAttribute("src")));
            # Convert all Image Sources to proxified Versions
            $img->setAttribute("src", $this->proxifyUrl($img->getAttribute("src"), false));
            # Some Images might contain a srcset (Different Images for different resolutions)
            # Syntax would be i.e. srcset="medium.jpg 1000w, large.jpg 2000w"
            $srcset = $img->getAttribute("srcset");
            if ($srcset !== "") {
                $images = explode(",", $srcset);
                foreach ($images as $index => $set) {
                    $set = trim($set);
                    $parts = preg_split("/\s+/si", $set);
                    # $parts[0] is the Image Path
                    # It could be relative so convert that one:
                    $parts[0] = $this->convertRelativeToAbsoluteLink($parts[0]);

                    # And now Proxify it:
                    $parts[0] = $this->proxifyUrl($parts[0], false);
                    $images[$index] = implode(" ", $parts);
                }
                $srcset = implode(",", $images);
                $img->setAttribute("srcset", $srcset);
            }
        }

        # All Input Elements
        foreach ($dom->getElementsByTagName('input') as $input) {
            if ($input->hasAttribute("src")) {
                # Convert all Image src's to Absolute Links
                $input->setAttribute("src", $this->convertRelativeToAbsoluteLink($input->getAttribute("src")));
                # input all Image Sources to proxified Versions
                $input->setAttribute("src", $this->proxifyUrl($input->getAttribute("src"), false));
            }
        }

        # All Source Tags
        foreach ($dom->getElementsByTagName('source') as $img) {
            if ($img->hasAttribute("src")) {
                # Convert all Image src's to Absolute Links
                $img->setAttribute("src", $this->convertRelativeToAbsoluteLink($img->getAttribute("src")));
                # Convert all Image Sources to proxified Versions
                $img->setAttribute("src", $this->proxifyUrl($img->getAttribute("src"), false));

            }
            # Some Images might contain a srcset (Different Images for different resolutions)
            # Syntax would be i.e. srcset="medium.jpg 1000w, large.jpg 2000w"
            $srcset = $img->getAttribute("srcset");
            if ($srcset !== "") {
                $images = explode(",", $srcset);
                foreach ($images as $index => $set) {
                    $set = trim($set);
                    $parts = preg_split("/\s+/si", $set);
                    # $parts[0] is the Image Path
                    # It could be relative so convert that one:
                    $parts[0] = $this->convertRelativeToAbsoluteLink($parts[0]);

                    # And now Proxify it:
                    $parts[0] = $this->proxifyUrl($parts[0], false);
                    $images[$index] = implode(" ", $parts);
                }
                $srcset = implode(",", $images);
                $img->setAttribute("srcset", $srcset);
            }
        }

        # Alle Meta Tags
        foreach ($dom->getElementsByTagName('meta') as $meta) {
            if ($meta->hasAttribute("href")) {
                # Convert all relative Links to absolute Ones
                $meta->setAttribute("href", $this->convertRelativeToAbsoluteLink($meta->getAttribute("href")));
                # Convert all Links to the proxified Version
                # All of this Links should NOT target to the top Level
                $meta->setAttribute("href", $this->proxifyUrl($meta->getAttribute("href"), false));
            }
            if ($meta->hasAttribute("http-equiv") && $meta->getAttribute("http-equiv") === "refresh") {
                # We should refresh the site with a meta tag
                # But not before profifying the new URL
                $content = $meta->getAttribute("content");
                $url = substr($content, stripos($content, "url=") + 4);
                # Convert all relative Links to absolute Ones
                $url = $this->convertRelativeToAbsoluteLink($url);

                # Convert all Links to the proxified Version
                # All of this Links should NOT target to the top Level
                $url = $this->proxifyUrl($url, false);

                $content = substr($content, 0, stripos($content, "url=") + 4) . $url;

                $meta->setAttribute("content", $content);
            }
        }

        # Alle Script Tags
        foreach ($dom->getElementsByTagName('script') as $script) {
            $script->nodeValue = "";
            $script->setAttribute("src", "");
            $script->setAttribute("type", "");
        }

        # Alle Style Blöcke
        # Werden extra geparsed
        foreach ($dom->getElementsByTagName('style') as $style) {
            $styleString = $style->nodeValue;
            $cssElement = new CssDocument($this->password, $this->baseUrl, $styleString);
            $cssElement->proxifyContent();
            $style->nodeValue = $cssElement->getResult();
        }

        # Nun alle Video Tags
        foreach ($dom->getElementsByTagName("video") as $video) {
            if ($video->hasAttribute("src")) {
                # Convert all relative Links to absolute Ones
                $video->setAttribute("src", $this->convertRelativeToAbsoluteLink($video->getAttribute("src")));
                # Convert all Links to the proxified Version
                # All of this Links should NOT target to the top Level
                $video->setAttribute("src", $this->proxifyUrl($video->getAttribute("src"), false));
            }
            if ($video->hasAttribute("poster")) {
                # Convert all relative Links to absolute Ones
                $video->setAttribute("poster", $this->convertRelativeToAbsoluteLink($video->getAttribute("poster")));
                # Convert all Links to the proxified Version
                # All of this Links should NOT target to the top Level
                $video->setAttribute("poster", $this->proxifyUrl($video->getAttribute("poster"), false));
            }
        }

        # Abschließend gehen wir noch einmal alle Tags durch
        foreach ($dom->getElementsByTagName('*') as $el) {
            if ($el->getAttribute("style") !== "") {
                $styleString = $el->getAttribute("style");
                $cssElement = new CssDocument($this->password, $this->baseUrl, $styleString);
                $cssElement->proxifyContent();
                $el->setAttribute("style", $cssElement->getResult());
            }

            # Some old sites might use the background attribute Let's parse them, too
            if ($el->hasAttribute("background")) {
                # Convert all relative Links to absolute Ones
                $el->setAttribute("background", $this->convertRelativeToAbsoluteLink($el->getAttribute("background")));
                # Convert all Links to the proxified Version
                # All of this Links should NOT target to the top Level
                $el->setAttribute("background", $this->proxifyUrl($el->getAttribute("background"), false));
            }

            # We Will Remove all Javascript Event attributes
            # To keep things simple we're gonna remove all Attributes which names start with "on"
            foreach ($el->attributes as $attr) {
                if (stripos($attr->name, "on") === 0) {
                    $el->removeAttribute($attr->name);
                }
            }
        }

        $this->htmlString = $dom->saveHtml();

        # Remove all now empty script Tags
        $this->htmlString = preg_replace("/<\s*[\/]{0,1}\s*script[^>]*?>/si", "", $this->htmlString);

        libxml_use_internal_errors(false);
    }

/**
 * This function changes the current Target Attribute on the link to given new target Attribute
 */
    private function convertTargetAttribute($link, $newTarget)
    {
        $link->setAttribute("target", $newTarget);
    }

}