Nettoyer le code HTML pour LibreOffice Writer

Ce bookmarklet permet de nettoyer le code HTML copié depuis une page web afin de pouvoir le coller dans un document LibreOffice Writer.

Nettoyage du code HTML

Le nettoyage du code HTML est effectué avec les caractéristiques suivantes :

Seules les balises p, h1, h2, h3, h4, h5, h6, strong, em, ul, ol, li, sup, sub, a, img, code et pre sont conservées
Les autres balises sont retirées (mais leur contenu est conservé)
Les URL relatives sont converties en URL absolues
Les balises script, noscript, link et style ainsi que leur contenu sont supprimées
Les images sont converties en data-url afin de pouvoir être copiées
Des règles typographiques françaises sont appliquées

Installation

Cette extension se présente sous la forme d’un bookmarklet. Pour l’installer, glissez-déposez le lien ci-dessous dans la barre de favoris de votre navigateur.

Clean HTML

Ne cliquez pas sur ce lien depuis cette fenêtre ! Glissez-déposez-le dans votre barre de favoris.

Une fois installé, allez sur une page, sélectionnez le texte et cliquez sur le lien dans votre barre de favoris.

Code source

javascript

(function () {
    // Names (lower case) of the attributes considered useful.
    const usefulAttributes = ["href", "src", "alt"];

    /**
     * Tells whether an attribute belongs to the list of useful attributes.
     * The attribute name is compared case-insensitively.
     * @param {Attr} attribute - The attribute to check.
     * @returns {boolean} True if the attribute is useful, false otherwise.
     */
    function isUsefulAttribute(attribute) {
        const lowerCasedName = attribute.name.toLowerCase();
        return usefulAttributes.includes(lowerCasedName);
    }

    // Tag names (lower case) of the elements considered bad.
    const badTags = ["script", "noscript", "link", "style"];

    /**
     * Tells whether a node is an element whose tag is in the bad-tag list.
     * Nodes that are not elements are never bad tags.
     * @param {Element} element - The element to check.
     * @returns {boolean} True if the element is a bad tag, false otherwise.
     */
    function isBadTag(element) {
        if (element.nodeType !== Node.ELEMENT_NODE) {
            return false;
        }

        const tagName = element.nodeName.toLowerCase();
        return badTags.includes(tagName);
    }

    // Tag names (lower case) of the elements considered useful.
    const usefulTags = [
        "p",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "strong", "em",
        "ul", "ol", "li",
        "sup", "sub",
        "a", "img",
        "pre", "code"
    ];

    /**
     * Tells whether a node is an element whose tag is in the useful-tag list.
     * Nodes that are not elements are never useful tags.
     * @param {Element} element - The element to check.
     * @returns {boolean} True if the element is a useful tag, false otherwise.
     */
    function isUsefulTag(element) {
        if (element.nodeType !== Node.ELEMENT_NODE) {
            return false;
        }

        const tagName = element.nodeName.toLowerCase();
        return usefulTags.includes(tagName);
    }

    /**
     * Tells whether a node is an "a" element that carries an href attribute.
     * @param {Element} element - The element to check.
     * @returns {boolean} True if the element is a link, false otherwise.
     */
    function isALink(element) {
        if (element.nodeType !== Node.ELEMENT_NODE) {
            return false;
        }

        const tagName = element.nodeName.toLowerCase();
        return tagName === "a" && element.hasAttribute("href");
    }

    /**
     * Tells whether a node is an "img" element.
     * @param {Element} element - The element to check.
     * @returns {boolean} True if the element is an image, false otherwise.
     */
    function isAnImage(element) {
        if (element.nodeType !== Node.ELEMENT_NODE) {
            return false;
        }

        return element.nodeName.toLowerCase() === "img";
    }

    /**
     * Loads an image and passes its content, re-encoded as a PNG data URL,
     * to a callback. The callback is only invoked from the image's load
     * handler, so it is never called if the image does not load.
     * @param {string} src - The source URL of the image.
     * @param {function} callback - Called with the data URL string.
     */
    function convertImage(src, callback) {
        const loader = new Image();
        loader.crossOrigin = "Anonymous";

        loader.onload = () => {
            const surface = document.createElement("canvas");
            const pen = surface.getContext("2d");

            // Give the canvas the intrinsic size of the image before drawing.
            surface.height = loader.naturalHeight;
            surface.width = loader.naturalWidth;
            pen.drawImage(loader, 0, 0);

            callback(surface.toDataURL("image/png"));
        };

        // The source is assigned last, once the load handler is in place.
        loader.src = src;
    }

    // French typographic rules, applied in order. Each entry is a
    // [pattern, replacement] pair handed to String.prototype.replace.
    // No-break spaces are written as \u00A0 escapes so that they stay visible
    // in the source: with an ordinary space in the replacement, the
    // punctuation and guillemet rules would replace text with itself.
    const typographicRules = [
        // Collapse runs of ordinary spaces into a single space.
        [/ +/g, " "],
        // Straight apostrophe -> typographic apostrophe.
        [/'/g, "’"],
        // No-break space before high punctuation, units and closing guillemet.
        [/ ([?!%€$»:;])/g, "\u00A0$1"],
        // No-break space after an opening guillemet.
        [/« /g, "«\u00A0"],
        // Three dots -> ellipsis character.
        [/\.\.\./g, "…"],
        // No-break space between a digit and a percent/currency sign.
        [/([0-9]) ?([%€$£])/g, "$1\u00A0$2"],
        // Groups of thousands separated by a space or a dot are joined.
        // NOTE(review): a narrow no-break space (U+202F) between the groups
        // may have been intended here instead of removing the separator —
        // confirm before changing.
        [/([0-9]{1,3})[ .]([0-9]{3})[ .]([0-9]{3})/g, "$1$2$3"],
        [/([0-9]{1,3})[ .]([0-9]{3})/g, "$1$2"]
    ];

    /**
     * Apply French typographic rules to a string.
     * The rules of typographicRules are applied one after the other, each one
     * working on the output of the previous one.
     * @param {string} str - The text to process.
     * @returns {string} The string with typographic rules applied.
     */
    function applyTypography(str) {
        return typographicRules.reduce(
            (text, [pattern, replacement]) => text.replace(pattern, replacement),
            str
        );
    }

    /**
     * Clean tags from an HTML document in order to include the result in
     * another document without importing styles or tags not useful.
     * It aims at keeping only semantic structures.
     *
     * The node is processed recursively:
     * - bad tags are dropped together with their content;
     * - useful tags are recreated with only their useful attributes, link and
     *   image URLs being made absolute;
     * - text nodes get the typographic rules applied;
     * - any other node is replaced by a document fragment that only holds its
     *   cleaned children.
     *
     * Images are converted to data URLs asynchronously: the returned tree may
     * still reference the original (absolute) image URL when this function
     * returns, and is updated later if the conversion succeeds.
     * @param {DocumentFragment} fragment The document fragment to clean.
     * @returns {DocumentFragment} The cleaned document fragment.
     */
    function cleanTags(fragment) {
        // Builds the callback that stores a converted source on an image.
        function setSource(element) {
            return function (src) {
                element.setAttribute("src", src);
            };
        }

        // Resolves a possibly relative URL against a base URL. The URL
        // constructor throws on a malformed URL; the raw value is then kept
        // so that one bad attribute does not abort the whole cleaning.
        function toAbsoluteUrl(url, base) {
            try {
                return new URL(url, base).href;
            } catch (ignore) {
                return url;
            }
        }

        // Ignore bad tags (and, since we return early, their children).
        if (isBadTag(fragment)) {
            return document.createDocumentFragment();
        }

        let cleaned;

        if (isUsefulTag(fragment)) {
            cleaned = document.createElement(fragment.nodeName);

            // Keep only the useful attributes.
            Array.from(fragment.attributes)
                .filter(isUsefulAttribute)
                .forEach(function (attribute) {
                    cleaned.setAttributeNode(attribute.cloneNode());
                });

            if (isAnImage(fragment)) {
                const rawSource = fragment.getAttribute("src");

                // An image without a source has nothing to embed.
                if (rawSource !== null && rawSource !== "") {
                    // Make the source absolute right away so that the image
                    // keeps a resolvable URL in the popup even if the
                    // asynchronous conversion below never calls back.
                    const absoluteSource = toAbsoluteUrl(
                        rawSource,
                        fragment.baseURI
                    );
                    cleaned.setAttribute("src", absoluteSource);

                    // Embed the image as a data URL once it is loaded.
                    convertImage(absoluteSource, setSource(cleaned));
                }
            } else if (isALink(fragment)) {
                // Convert any relative link to an absolute link.
                cleaned.setAttribute(
                    "href",
                    toAbsoluteUrl(fragment.getAttribute("href"), fragment.baseURI)
                );
            }
        } else if (fragment.nodeType === Node.TEXT_NODE) {
            // Apply typographic rules.
            cleaned = document.createTextNode(applyTypography(fragment.data));
        } else {
            // Anything else will be converted to a blank document fragment,
            // which acts as a transparent container for the cleaned children.
            cleaned = document.createDocumentFragment();
        }

        // Recursively clean children of the current node.
        Array.from(fragment.childNodes).forEach(function (child) {
            cleaned.appendChild(cleanTags(child));
        });

        return cleaned;
    }

    /**
     * Open a popup whose content will be a document fragment given as argument.
     * If the browser refuses to open the popup (window.open returns null, for
     * instance because of a popup blocker), a warning is logged and nothing
     * else happens.
     * @param {DocumentFragment} fragment
     */
    function createPopup(fragment) {
        const popup = window.open("", "_blank", "popup");

        // window.open yields null when the window could not be opened.
        if (!popup) {
            console.warn("CleanHTML: unable to open the popup window.");
            return;
        }

        const copyTab = popup.document;
        copyTab.title = "CleanHTML";
        copyTab.body.appendChild(fragment);

        // Closes the document stream of the popup, not the popup window.
        copyTab.close();
    }

    /**
     * Clone the selection into a document fragment.
     * Every range of the selection is cloned, in order, into a single
     * fragment; the page itself is left untouched.
     * @returns {DocumentFragment} The cloned selection, or null when there is
     * nothing selected (no selection object, no range, or a collapsed
     * selection, i.e. a bare caret).
     */
    function cloneSelection() {
        // This script needs a selection from the user to work.
        const selection = window.getSelection();
        if (
            selection === null
            || selection.rangeCount < 1
            || selection.isCollapsed
        ) {
            return null;
        }

        // Copy the content of every range to a document fragment.
        const elements = document.createDocumentFragment();
        for (let i = 0; i < selection.rangeCount; i += 1) {
            elements.appendChild(selection.getRangeAt(i).cloneContents());
        }

        return elements;
    }

    // Entry point: clone what the user selected, clean it and show the result
    // in a popup. Nothing happens when cloneSelection reports no selection.
    const clonedSelection = cloneSelection();
    if (clonedSelection !== null) {
        const cleanedContent = cleanTags(clonedSelection);
        createPopup(cleanedContent);
    }
})();