Skip to content

Commit

Permalink
allow raw HTML markup for a few (whitelisted) tags
Browse files Browse the repository at this point in the history
To stay compatible with comments imported from Disqus (and with users
unfamiliar with Markdown), Misaka no longer strips user-supplied HTML.
Instead, the generated HTML is post-processed and every "unsafe" tag
(i.e. one that Markdown itself cannot produce) is discarded.

Whitelist: p, a, pre, blockquote, h1-h6, em, sub, sup, del, ins, math,
           dl, ol, ul, li

This commit also removes an unnecessary newline generated by
Misaka/Sundown.

Conflicts:
	isso/utils/__init__.py
  • Loading branch information
posativ committed Jan 12, 2014
1 parent 48e7ddb commit 104afa8
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 8 deletions.
73 changes: 70 additions & 3 deletions isso/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,19 @@
import pkg_resources
werkzeug = pkg_resources.get_distribution("werkzeug")

import io
import json
import random
import hashlib

from string import ascii_letters, digits

try:
from html.parser import HTMLParser, HTMLParseError
except ImportError:
from HTMLParser import HTMLParser, HTMLParseError

from werkzeug.utils import escape
from werkzeug.wrappers import Request
from werkzeug.exceptions import BadRequest

Expand Down Expand Up @@ -126,13 +133,69 @@ def get_json(self):
raise BadRequest('Unable to read JSON request')


class Sanitizer(HTMLParser, object):
    """Sanitize HTML output by dropping every tag that is not whitelisted.

    The markup is parsed as soon as the instance is constructed; the
    filtered markup can then be read from ``result``, a text stream that
    has been rewound to the start.  Tags outside of ``safe`` (such as
    ``iframe`` or ``script``) are removed, the text between them is kept.

    NOTE(review): attributes of whitelisted tags are copied through
    unfiltered, so event handlers (``onclick``) and ``javascript:`` URLs
    survive sanitizing -- consider adding an attribute whitelist.
    """

    # Tags that are written back to the output; everything else is dropped.
    safe = set([
        "p", "a", "pre", "blockquote",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "em", "sub", "sup", "del", "ins", "math",
        "dl", "ol", "ul", "li"])

    @classmethod
    def format(cls, attrs):
        """Serialize ``attrs`` into the attribute part of a tag.

        :param attrs: iterable of ``(name, value)`` pairs as handed over
                      by the parser; ``value`` is None for attributes
                      without a value.
        :return: the attributes joined by single spaces, values escaped
                 and wrapped in double quotes.
        """
        res = []
        for key, value in attrs:
            if value is None:
                # valueless attribute: emit the bare name
                res.append(key)
            else:
                res.append(u'{0}="{1}"'.format(key, escape(value)))
        return u' '.join(res)

    def __init__(self, html):
        """Parse ``html`` and store the sanitized markup in ``result``."""
        super(Sanitizer, self).__init__()

        # Python 3.5+ decodes character references by default and passes
        # them to handle_data() as plain text, which would turn an escaped
        # "&lt;script&gt;" back into a live tag.  Keep the references intact
        # so handle_entityref()/handle_charref() can write them verbatim.
        # (On Python 2 this is just an unused attribute.)
        self.convert_charrefs = False

        self.result = io.StringIO()
        self.feed(html)
        self.result.seek(0)

    def _write_tag(self, tag, attrs, close):
        """Write ``<tag attrs`` followed by ``close`` if ``tag`` is safe."""
        if tag not in Sanitizer.safe:
            return

        self.result.write(u"<" + tag)
        if attrs:
            self.result.write(u" " + Sanitizer.format(attrs))
        self.result.write(close)

    def handle_starttag(self, tag, attrs):
        self._write_tag(tag, attrs, u">")

    def handle_data(self, data):
        # Text is kept regardless of the tag that surrounds it.
        self.result.write(data)

    def handle_endtag(self, tag):
        if tag in Sanitizer.safe:
            self.result.write(u"</" + tag + u">")

    def handle_startendtag(self, tag, attrs):
        self._write_tag(tag, attrs, u"/>")

    def handle_entityref(self, name):
        # Re-emit named references (e.g. "&lt;") exactly as they came in.
        self.result.write(u'&' + name + u';')

    def handle_charref(self, char):
        # Re-emit numeric references (e.g. "&#39;") exactly as they came in.
        self.result.write(u'&#' + char + u';')


def markdown(text):
"""Convert Markdown to (safe) HTML.
>>> markdown("*Ohai!*") # doctest: +IGNORE_UNICODE
'<p><em>Ohai!</em></p>'
>>> markdown("<em>Hi</em>") # doctest: +IGNORE_UNICODE
'<p><em>Hi</em></p>'
>>> markdown("<script>alert('Onoe')</script>") # doctest: +IGNORE_UNICODE
'<p>alert(&#39;Onoe&#39;)</p>'
"<p>alert('Onoe')</p>"
>>> markdown("http://example.org/ and sms:+1234567890") # doctest: +IGNORE_UNICODE
'<p><a href="http://example.org/">http://example.org/</a> and sms:+1234567890</p>'
"""
Expand All @@ -141,9 +204,13 @@ def markdown(text):
exts = misaka.EXT_STRIKETHROUGH | misaka.EXT_SUPERSCRIPT | misaka.EXT_AUTOLINK

# remove HTML tags, skip <img> (for now) and only render "safe" protocols
html = misaka.HTML_SKIP_HTML | misaka.HTML_SKIP_IMAGES | misaka.HTML_SAFELINK
html = misaka.HTML_SKIP_STYLE | misaka.HTML_SKIP_IMAGES | misaka.HTML_SAFELINK

rv = misaka.html(text, extensions=exts, render_flags=html).rstrip("\n")
if not rv.startswith("<p>") and not rv.endswith("</p>"):
rv = "<p>" + rv + "</p>"

return misaka.html(text, extensions=exts, render_flags=html).strip("\n")
return Sanitizer(rv).result.read()


def origin(hosts):
Expand Down
10 changes: 5 additions & 5 deletions specs/test_comments.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def testGet(self):
rv = loads(r.data)

assert rv['id'] == 1
assert rv['text'] == '<p>Lorem ipsum ...</p>\n'
assert rv['text'] == '<p>Lorem ipsum ...</p>'

def testCreate(self):

Expand All @@ -66,7 +66,7 @@ def testCreate(self):
rv = loads(rv.data)

assert rv["mode"] == 1
assert rv["text"] == '<p>Lorem ipsum ...</p>\n'
assert rv["text"] == '<p>Lorem ipsum ...</p>'

def textCreateWithNonAsciiText(self):

Expand All @@ -78,7 +78,7 @@ def textCreateWithNonAsciiText(self):
rv = loads(rv.data)

assert rv["mode"] == 1
assert rv["text"] == '<p>Здравствуй, мир!</p>\n'
assert rv["text"] == '<p>Здравствуй, мир!</p>'

def testCreateMultiple(self):

Expand Down Expand Up @@ -261,10 +261,10 @@ def testModify(self):
self.post('/new?uri=test', data=json.dumps({"text": "Tpyo"}))

self.put('/id/1', data=json.dumps({"text": "Tyop"}))
assert loads(self.get('/id/1').data)["text"] == "<p>Tyop</p>\n"
assert loads(self.get('/id/1').data)["text"] == "<p>Tyop</p>"

self.put('/id/1', data=json.dumps({"text": "Typo"}))
assert loads(self.get('/id/1').data)["text"] == "<p>Typo</p>\n"
assert loads(self.get('/id/1').data)["text"] == "<p>Typo</p>"

def testDeleteCommentRemovesThread(self):

Expand Down

0 comments on commit 104afa8

Please sign in to comment.