Source code for mezzanine.utils.html

from __future__ import absolute_import, unicode_literals
from future.builtins import chr, int, str

try:
    from html.parser import HTMLParser
    from html.entities import name2codepoint
    try:
        from html.parser import HTMLParseError
    except ImportError:  # Python 3.5+
        class HTMLParseError(Exception):
            pass
except ImportError:  # Python 2
    from HTMLParser import HTMLParser, HTMLParseError
    from htmlentitydefs import name2codepoint

import re

from mezzanine.utils.deprecation import mark_safe


# Tags that TagCloser skips when tracking open tags, so no closing
# tag is ever appended for them.
SELF_CLOSING_TAGS = ['br', 'img']
# NOTE(review): not referenced in the visible part of this module;
# presumably tags that must keep an explicit closing tag - confirm
# against callers elsewhere.
NON_SELF_CLOSING_TAGS = ['script', 'iframe']
# Maps a tag name to the attribute that holds its URL. absolute_urls()
# rewrites each of these attributes to an absolute URL.
ABSOLUTE_URL_TAGS = {"img": "src", "a": "href", "iframe": "src"}

# Tags and attributes added to richtext filtering whitelist when the
# RICHTEXT_FILTER_LEVEL is set to low. General use-case for these is
# allowing embedded video, but we will add to this fixed list over
# time as more use-cases come up. We won't ever add script tags or
# events (onclick etc) to this list. To enable those, filtering can
# be turned off in the settings admin.
LOW_FILTER_TAGS = ("iframe", "embed", "video", "param", "source", "object")
LOW_FILTER_ATTRS = ("allowfullscreen", "autostart", "loop", "hidden",
                    "playcount", "volume", "controls", "data", "classid")


def absolute_urls(html):
    """
    Rewrite relative URLs found in ``html`` as absolute URLs.

    The tags and attributes inspected are those listed in
    ``ABSOLUTE_URL_TAGS``. Each non-empty URL is passed through the
    current request's ``build_absolute_uri``. When there is no current
    request, ``html`` is returned untouched; otherwise the re-serialised
    document is returned. Used for RSS feeds to provide more complete
    HTML for item descriptions, but could also be used as a general
    richtext filter.
    """
    from bs4 import BeautifulSoup
    from mezzanine.core.request import current_request

    active_request = current_request()
    if active_request is None:
        # Without a request there is nothing to resolve URLs against.
        return html

    soup = BeautifulSoup(html, "html.parser")
    for tag_name, url_attr in ABSOLUTE_URL_TAGS.items():
        for element in soup.findAll(tag_name):
            location = element.get(url_attr, "")
            if not location:
                continue
            element[url_attr] = active_request.build_absolute_uri(location)
    return str(soup)
def decode_entities(html):
    """
    Replace HTML entities in a string with the characters they stand
    for.

    Handles decimal (``&#65;``) and hexadecimal (``&#x41;``) numeric
    references as well as named entities found in ``name2codepoint``.
    Literal ``&amp;`` sequences are collapsed to ``&`` first, so a
    double-escaped entity such as ``&amp;lt;`` is decoded as well.
    Anything that cannot be decoded (unknown name, malformed or
    out-of-range number) is left in the output unchanged.

    Adapted from http://effbot.org/zone/re-sub.htm#unescape-html
    """
    def decode(match):
        entity = match.group(0)
        if entity[:2] == "&#":
            try:
                # The hex marker may be written as "x" or "X".
                if entity[:3].lower() == "&#x":
                    return chr(int(entity[3:-1], 16))
                return chr(int(entity[2:-1]))
            except (ValueError, OverflowError):
                # ValueError: not a number, or outside the Unicode range.
                # OverflowError: number too large for chr() to accept.
                return entity
        try:
            return chr(name2codepoint[entity[1:-1]])
        except KeyError:
            # Unknown entity name - keep the original text.
            return entity

    return re.sub(r"&#?\w+;", decode, html.replace("&amp;", "&"))
@mark_safe
def escape(html):
    """
    Escapes HTML according to the rules defined by the settings
    ``RICHTEXT_FILTER_LEVEL``, ``RICHTEXT_ALLOWED_TAGS``,
    ``RICHTEXT_ALLOWED_ATTRIBUTES``, ``RICHTEXT_ALLOWED_STYLES``.

    When the filter level is "none" the input is returned as is. When
    it is "low", ``LOW_FILTER_TAGS`` and ``LOW_FILTER_ATTRS`` are
    allowed in addition to the configured tags and attributes. The
    cleaning itself is delegated to ``bleach.clean``, with disallowed
    markup stripped, comments kept, and the ``tel`` protocol allowed on
    top of bleach's default protocols.
    """
    from bleach import clean, ALLOWED_PROTOCOLS
    from mezzanine.conf import settings
    from mezzanine.core import defaults

    filter_level = settings.RICHTEXT_FILTER_LEVEL
    if filter_level == defaults.RICHTEXT_FILTER_LEVEL_NONE:
        return html

    tags = settings.RICHTEXT_ALLOWED_TAGS
    attrs = settings.RICHTEXT_ALLOWED_ATTRIBUTES
    styles = settings.RICHTEXT_ALLOWED_STYLES
    if filter_level == defaults.RICHTEXT_FILTER_LEVEL_LOW:
        # Build new sequences instead of using ``+=``: when a setting is
        # a list, ``+=`` would extend the settings object itself in
        # place and the whitelist would keep growing on every call.
        tags = tags + type(tags)(LOW_FILTER_TAGS)
        attrs = attrs + type(attrs)(LOW_FILTER_ATTRS)
    if isinstance(attrs, tuple):
        attrs = list(attrs)

    # ALLOWED_PROTOCOLS is not a list in every bleach release, so
    # normalise it before appending the extra protocol.
    protocols = list(ALLOWED_PROTOCOLS) + ["tel"]
    return clean(html, tags=tags, attributes=attrs, strip=True,
                 strip_comments=False, styles=styles,
                 protocols=protocols)
@mark_safe
def thumbnails(html):
    """
    Swap the ``src`` of ``img`` tags in a HTML string for thumbnail
    paths produced by Mezzanine's ``thumbnail`` template tag.

    Only images whose ``src`` starts with ``MEDIA_URL`` (compared
    case-insensitively) and whose ``width`` and ``height`` attributes
    are both all-digit values are rewritten. Used as one of the default
    values in the ``RICHTEXT_FILTERS`` setting.
    """
    from django.conf import settings
    from bs4 import BeautifulSoup
    from mezzanine.core.templatetags.mezzanine_tags import thumbnail

    media_url = settings.MEDIA_URL
    media_prefix = media_url.lower()
    # No MEDIA_URL anywhere in the markup means no image can qualify,
    # so skip parsing entirely.
    if media_prefix not in html.lower():
        return html

    soup = BeautifulSoup(html, "html.parser")
    for image in soup.findAll("img"):
        source = image.get("src", "")
        if not source.lower().startswith(media_prefix):
            continue
        width, height = image.get("width"), image.get("height")
        if not (str(width).isdigit() and str(height).isdigit()):
            continue
        image["src"] = media_url + thumbnail(source, width, height)
    # BeautifulSoup adds closing br tags, which the browser interprets
    # as br tags, so strip them from the serialised output.
    return str(soup).replace("</br>", "")
class TagCloser(HTMLParser):
    """
    HTMLParser subclass that appends closing tags for any tags left
    open.

    Takes a HTML string as its only argument. After construction the
    ``html`` attribute holds the original string followed by whatever
    closing tags are still required, and ``tags`` holds the names of
    the tags that were left open, most recently opened first. If
    parsing raises ``HTMLParseError``, ``html`` is left as given.
    """

    def __init__(self, html):
        HTMLParser.__init__(self)
        self.html = html
        self.tags = []
        try:
            self.feed(self.html)
        except HTMLParseError:
            # Unparseable markup: leave the HTML untouched.
            return
        self.html += "".join("</%s>" % name for name in self.tags)

    def handle_starttag(self, tag, attrs):
        # Tags listed as self-closing never need a closing tag.
        if tag in SELF_CLOSING_TAGS:
            return
        # Newest tag goes first so closing tags come out innermost-first.
        self.tags.insert(0, tag)

    def handle_endtag(self, tag):
        # Drop the most recently opened matching tag; a stray end tag
        # with no matching open tag is ignored.
        if tag in self.tags:
            self.tags.remove(tag)