This repository has been archived on 2025-05-04. You can view files and clone it, but you cannot make any changes to its state, such as pushing and creating new issues, pull requests or comments.
s2forums/apps/forum/markups/postmarkup.py
2009-01-19 17:46:49 +02:00

1486 lines
44 KiB
Python

# -*- coding: UTF-8 -*-
"""
Post Markup
Author: Will McGugan (http://www.willmcgugan.com)
"""
__version__ = "1.1.4"
import re
from urllib import quote, unquote, quote_plus, urlencode
from urlparse import urlparse, urlunparse
pygments_available = True
try:
from pygments import highlight
from pygments.lexers import get_lexer_by_name, ClassNotFound
from pygments.formatters import HtmlFormatter
except ImportError:
# Make Pygments optional
pygments_available = False
def annotate_link(domain):
    """Builds the annotation that follows a rendered link.

    The link and search tags call this after closing their anchor. Replace
    this function to change or suppress the annotation.

    domain -- Domain parsed from url

    Returns a string of the form u" [domain]", with the domain html-escaped.
    """
    escaped_domain = _escape(domain)
    return u" [%s]" % escaped_domain
# Matches bare http/https urls in text; PostMarkup.tagify_urls wraps every
# match in [url] tags.
# NOTE(review): inside the character class, "\+-=" forms a range from "+" to
# "=", which also admits ",", "<" and the digits -- confirm this is intended.
_re_url = re.compile(r"((https?):((//)|(\\\\))+[\w\d:#@%/;$()~_?\+-=\\\.&]*)", re.MULTILINE|re.UNICODE)
# Matches html tags ("<...>") and character entities ("&...;"); neither kind
# of match can span a newline because DOTALL is not set.
_re_html=re.compile('<.*?>|\&.*?\;', re.UNICODE)
def textilize(s):
    """Returns s with html tags and character entities removed.

    s -- A string containing html
    """
    plain_text = _re_html.sub("", s)
    return plain_text
# Matches a span running from an opening [" tag to its closing [/" tag.
_re_excerpt = re.compile(r'\[".*?\]+?.*?\[/".*?\]+?', re.DOTALL|re.UNICODE)
# Matches any bracketed tag, i.e. "[" up to the next "]".
_re_remove_markup = re.compile(r'\[.*?\]', re.DOTALL|re.UNICODE)
# Matches a run of one or more newlines.
_re_break_groups = re.compile(r'\n+', re.DOTALL|re.UNICODE)
def get_excerpt(post):
    """Returns the first excerpt found between ["] and [/"].

    Newlines in the excerpt become <br/> and every bracketed tag is removed.
    An empty string is returned when the post holds no excerpt.

    post -- BBCode string
    """
    found = _re_excerpt.search(post)
    if found is not None:
        with_breaks = found.group(0).replace(u'\n', u"<br/>")
        return _re_remove_markup.sub("", with_breaks)
    return ""
def strip_bbcode(bbcode):
    """Returns the text of a bbcode string with all of its tags removed.

    bbcode -- A string to remove tags from
    """
    text_pieces = []
    for token in PostMarkup.tokenize(bbcode):
        # token is (type, contents, start_pos, end_pos); keep plain text only.
        if token[0] == PostMarkup.TOKEN_TEXT:
            text_pieces.append(token[1])
    return u"".join(text_pieces)
def create(include=None, exclude=None, use_pygments=True, **kwargs):
    """Create a postmarkup object that converts bbcode to XML snippets. Note
    that creating postmarkup objects is _not_ threadsafe, but rendering the
    html _is_ threadsafe. So typically you will need just one postmarkup instance
    to render the bbcode across threads.

    include -- List or similar iterable containing the names of the tags to use
               If omitted, all tags will be used
    exclude -- List or similar iterable containing the names of the tags to exclude.
               If omitted, no tags will be excluded
    use_pygments -- If True, Pygments (http://pygments.org/) will be used for the code tag,
                    otherwise it will use <pre>code</pre>
    kwargs -- Remaining keyword arguments are passed to tag constructors.

    Raises AssertionError if use_pygments is True but Pygments could not be
    imported.
    """
    postmarkup = PostMarkup()
    postmarkup_add_tag = postmarkup.tag_factory.add_tag

    def add_tag(tag_class, name, *args, **tag_kwargs):
        # Register the tag only if it passes the include / exclude filters.
        if include is None or name in include:
            if exclude is not None and name in exclude:
                return
            postmarkup_add_tag(tag_class, name, *args, **tag_kwargs)

    add_tag(SimpleTag, 'b', 'strong')
    add_tag(SimpleTag, 'i', 'em')
    add_tag(SimpleTag, 'u', 'u')
    add_tag(SimpleTag, 's', 'strike')

    add_tag(LinkTag, 'link', **kwargs)
    add_tag(LinkTag, 'url', **kwargs)

    add_tag(QuoteTag, 'quote')

    add_tag(SearchTag, u'wiki',
            u"http://en.wikipedia.org/wiki/Special:Search?search=%s", u'wikipedia.com', **kwargs)
    add_tag(SearchTag, u'google',
            u"http://www.google.com/search?hl=en&q=%s&btnG=Google+Search", u'google.com', **kwargs)
    add_tag(SearchTag, u'dictionary',
            u"http://dictionary.reference.com/browse/%s", u'dictionary.com', **kwargs)
    add_tag(SearchTag, u'dict',
            u"http://dictionary.reference.com/browse/%s", u'dictionary.com', **kwargs)

    add_tag(ImgTag, u'img')
    add_tag(ListTag, u'list')
    add_tag(ListItemTag, u'*')

    add_tag(SizeTag, u"size")
    add_tag(ColorTag, u"color")
    add_tag(CenterTag, u"center")

    if use_pygments:
        # Raised explicitly rather than with an assert statement so that the
        # check still runs under "python -O". The exception type is unchanged.
        if not pygments_available:
            raise AssertionError("Install Pygments (http://pygments.org/) or call create with use_pygments=False")
        add_tag(PygmentsCodeTag, u'code', **kwargs)
    else:
        add_tag(CodeTag, u'code', **kwargs)

    add_tag(ParagraphTag, u"p")

    return postmarkup
class TagBase(object):
    """Base class for all bbcode tags.

    A tag object records where it was opened and closed during parsing and
    renders its html through render_open / render_close.
    """

    def __init__(self, name, enclosed=False, auto_close=False, inline=False, strip_first_newline=False, **kwargs):
        """Base class for all tags.

        name -- The name of the bbcode tag
        enclosed -- True if the contents of the tag should not be bbcode processed.
        auto_close -- True if the tag is standalone and does not require a close tag.
        inline -- True if the tag generates an inline html tag.
        strip_first_newline -- True if leading whitespace of the first text
                               inside the tag should be stripped.
        kwargs -- Ignored; accepted so that subclasses can share keyword arguments.
        """
        self.name = name
        self.enclosed = enclosed
        self.auto_close = auto_close
        self.inline = inline
        self.strip_first_newline = strip_first_newline

        # Set here so the attribute exists before open() is called.
        self.params = u""
        self.open_pos = None
        self.close_pos = None
        self.open_node_index = None
        self.close_node_index = None

    def open(self, parser, params, open_pos, node_index):
        """ Called when the open tag is initially encountered.

        params -- The parameter string given in the tag
        open_pos -- Position in the markup recorded for the tag's opening
        node_index -- Node index recorded for the tag's opening
        """
        self.params = params
        self.open_pos = open_pos
        self.open_node_index = node_index

    def close(self, parser, close_pos, node_index):
        """ Called when the close tag is initially encountered.

        close_pos -- Position in the markup recorded for the tag's closing
        node_index -- Node index recorded for the tag's closing
        """
        self.close_pos = close_pos
        self.close_node_index = node_index

    def render_open(self, parser, node_index):
        """ Called to render the open tag. Returns html, or None for no output. """
        pass

    def render_close(self, parser, node_index):
        """ Called to render the close tag. Returns html, or None for no output. """
        pass

    def get_contents(self, parser):
        """Returns the string between the open and close tag."""
        return parser.markup[self.open_pos:self.close_pos]

    def get_contents_text(self, parser):
        """Returns the string between the open and close tag, minus bbcode tags."""
        return u"".join(parser.get_text_nodes(self.open_node_index, self.close_node_index))

    def skip_contents(self, parser):
        """Skips the contents of a tag while rendering."""
        parser.skip_to_node(self.close_node_index)

    def __str__(self):
        return '[%s]' % self.name
class SimpleTag(TagBase):
    """An inline tag rendered as a bare html element with a given name."""

    def __init__(self, name, html_name, **kwargs):
        """
        name -- The name of the bbcode tag
        html_name -- the html tag to substitute.
        """
        TagBase.__init__(self, name, inline=True)
        self.html_name = html_name

    def render_open(self, parser, node_index):
        """Returns the opening html tag."""
        opening = u"<%s>" % self.html_name
        return opening

    def render_close(self, parser, node_index):
        """Returns the closing html tag."""
        closing = u"</%s>" % self.html_name
        return closing
class DivStyleTag(TagBase):
    """A tag that is replaced with a div carrying one inline style rule."""

    def __init__(self, name, style, value, **kwargs):
        """
        name -- The name of the bbcode tag
        style -- The css property to set on the div
        value -- The value given to that css property
        """
        TagBase.__init__(self, name)
        self.style = style
        self.value = value

    def render_open(self, parser, node_index):
        """Returns the opening div with its style attribute."""
        css_rule = (self.style, self.value)
        return u'<div style="%s:%s;">' % css_rule

    def render_close(self, parser, node_index):
        """Returns the closing div."""
        return u'</div>'
class LinkTag(TagBase):
    """Renders [link] / [url] tags as html anchors.

    The url comes from the tag parameter if there is one, otherwise from the
    text inside the tag. Only http and https urls are rendered. Nested link
    tags collapse in to the outermost one.
    """

    # Characters copied to the href unchanged; all others are percent encoded.
    _safe_chars = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                            'abcdefghijklmnopqrstuvwxyz'
                            '0123456789'
                            '_.-=/&?:%&')

    _re_domain = re.compile(r"//([a-z0-9-\.]*)", re.UNICODE)

    def __init__(self, name, annotate_links=True, **kwargs):
        """
        name -- The name of the bbcode tag
        annotate_links -- If True the domain is appended after the link
        """
        TagBase.__init__(self, name, inline=True)
        self.annotate_links = annotate_links

    def render_open(self, parser, node_index):
        """Returns the opening anchor, or an empty string if the url is rejected."""
        self.domain = u''
        tag_data = parser.tag_data
        nest_level = tag_data['link_nest_level'] = tag_data.setdefault('link_nest_level', 0) + 1

        # Only the outermost link of a nested group produces output.
        if nest_level > 1:
            return u""

        if self.params:
            url = self.params.strip()
        else:
            # Text nodes were html-escaped by the parser, so undo that here.
            url = self.get_contents_text(parser).strip()
            url = _unescape(url)

        self.domain = ""

        if u"javascript:" in url.lower():
            return ""

        if ':' not in url:
            url = 'http://' + url

        scheme, uri = url.split(':', 1)

        if scheme not in ['http', 'https']:
            return u''

        # search() returns None when the url has no "//" part (e.g.
        # "http:foo"); treat that as an invalid link rather than crashing.
        domain_match = self._re_domain.search(uri.lower())
        if domain_match is None:
            return u''
        domain = domain_match.group(1)

        if domain.startswith('www.'):
            domain = domain[4:]

        def percent_encode(s):
            safe_chars = self._safe_chars

            def replace(c):
                if c not in safe_chars:
                    return "%%%02X" % ord(c)
                else:
                    return c
            return "".join([replace(c) for c in s])

        self.url = percent_encode(url.encode('utf-8', 'replace'))
        self.domain = domain

        if not self.url:
            return u""

        if self.domain:
            return u'<a href="%s">' % self.url
        else:
            return u""

    def render_close(self, parser, node_index):
        """Returns the closing anchor plus annotation, if an anchor was opened."""
        tag_data = parser.tag_data
        tag_data['link_nest_level'] -= 1

        if tag_data['link_nest_level'] > 0:
            return u''

        if self.domain:
            return u'</a>' + self.annotate_link(self.domain)
        else:
            return u''

    def annotate_link(self, domain=None):
        """Returns the annotation for domain, or an empty string if disabled."""
        if domain and self.annotate_links:
            return annotate_link(domain)
        else:
            return u""
class QuoteTag(TagBase):
    """Renders a [quote] tag as a blockquote, with an optional attribution."""

    def __init__(self, name, **kwargs):
        """
        name -- The name of the bbcode tag
        """
        TagBase.__init__(self, name, strip_first_newline=True)

    def open(self, parser, *args):
        """Delegates to TagBase.open."""
        TagBase.open(self, parser, *args)

    def close(self, parser, *args):
        """Delegates to TagBase.close."""
        TagBase.close(self, parser, *args)

    def render_open(self, parser, node_index):
        """Returns the opening blockquote, preceded by the escaped parameter if given."""
        if not self.params:
            return u'<blockquote>'
        attribution = PostMarkup.standard_replace(self.params)
        return u'<blockquote><em>%s</em><br/>' % (attribution)

    def render_close(self, parser, node_index):
        """Returns the closing blockquote."""
        return u"</blockquote>"
class SearchTag(TagBase):
    """Renders a tag as a link to a search url.

    The search term is the tag parameter if there is one, otherwise the raw
    contents of the tag.
    """

    def __init__(self, name, url, label="", annotate_links=True, **kwargs):
        """
        name -- The name of the bbcode tag
        url -- The search url; the url-quoted search term is substituted
               in to it with the % operator
        label -- Text used to annotate the link
        annotate_links -- If True the label is appended after the link
        """
        TagBase.__init__(self, name, inline=True)
        self.url = url
        self.label = label
        self.annotate_links = annotate_links

    def render_open(self, parser, node_index):
        """Returns the opening anchor pointing at the search url."""
        if self.params:
            search = self.params
        else:
            search = self.get_contents(parser)
        link = u'<a href="%s">' % self.url
        if u'%' in link:
            return link % quote_plus(search.encode("UTF-8"))
        else:
            return link

    def render_close(self, parser, node_index):
        """Returns the closing anchor, followed by the label annotation if enabled."""
        # render_open always emits an anchor, so it must always be closed;
        # only the annotation depends on the label.
        if self.label and self.annotate_links:
            return u'</a>' + annotate_link(self.label)
        return u'</a>'
class PygmentsCodeTag(TagBase):
    """Renders a [code] tag with Pygments syntax highlighting.

    The tag parameter names the lexer. If Pygments has no lexer of that name
    the escaped code is placed in a plain <pre> block.
    """

    def __init__(self, name, pygments_line_numbers=False, **kwargs):
        """
        name -- The name of the bbcode tag
        pygments_line_numbers -- Passed to the Pygments formatter as linenos
        """
        TagBase.__init__(self, name, enclosed=True, strip_first_newline=True)
        self.line_numbers = pygments_line_numbers

    def render_open(self, parser, node_index):
        """Returns the html for the whole code block and skips its contents."""
        source = self.get_contents(parser)
        self.skip_contents(parser)

        try:
            lexer = get_lexer_by_name(self.params, stripall=True)
        except ClassNotFound:
            return '<div class="code"><pre>%s</pre></div>' % _escape(source)
        else:
            formatter = HtmlFormatter(linenos=self.line_numbers, cssclass="code")
            return highlight(source, lexer, formatter)
class CodeTag(TagBase):
    """Renders a [code] tag as an escaped <pre> block, without highlighting."""

    def __init__(self, name, **kwargs):
        """
        name -- The name of the bbcode tag
        """
        TagBase.__init__(self, name, enclosed=True, strip_first_newline=True)

    def render_open(self, parser, node_index):
        """Returns the html for the whole code block and skips its contents."""
        raw_code = self.get_contents(parser)
        escaped_code = _escape_no_breaks(raw_code)
        self.skip_contents(parser)
        return '<div class="code"><pre>%s</pre></div>' % escaped_code
class ImgTag(TagBase):
    """Renders an [img] tag as an html img element.

    The image url is the contents of the tag with any bbcode removed.
    """

    def __init__(self, name, **kwargs):
        """
        name -- The name of the bbcode tag
        """
        TagBase.__init__(self, name, inline=True)

    def render_open(self, parser, node_index):
        """Returns the img element, or an empty string for a javascript url."""
        contents = self.get_contents(parser)
        self.skip_contents(parser)

        url = strip_bbcode(contents).replace(u'"', "%22")

        # The url is user input: refuse script urls in the same way LinkTag
        # does, and escape html-significant characters before they are
        # written in to the attribute.
        if u"javascript:" in url.lower():
            return u''

        return u'<img src="%s"></img>' % PostMarkup.standard_replace_no_break(url)
class ListTag(TagBase):
    """Renders a [list] tag as an html list.

    The tag parameter selects the list type: "1" gives a numbered list, "a"
    and "A" give lower / upper case lettered lists, and anything else gives a
    bulleted list. A list nested inside another list renders nothing.
    """

    # Opening html for each ordered list type, keyed by tag parameter.
    _ordered_openers = {
        "1": u"<ol><li>",
        "a": u'<ol style="list-style-type: lower-alpha;"><li>',
        "A": u'<ol style="list-style-type: upper-alpha;"><li>',
    }

    def __init__(self, name, **kwargs):
        """
        name -- The name of the bbcode tag
        """
        TagBase.__init__(self, name, strip_first_newline=True)

    def render_open(self, parser, node_index):
        """Returns the opening list html, or an empty string for a nested list."""
        # An empty close_tag marks a list that rendered nothing.
        self.close_tag = u""

        tag_data = parser.tag_data
        tag_data.setdefault("ListTag.count", 0)

        if tag_data["ListTag.count"]:
            return u""

        tag_data["ListTag.count"] += 1
        tag_data["ListItemTag.initial_item"] = True

        opener = self._ordered_openers.get(self.params)
        if opener is not None:
            self.close_tag = u"</li></ol>"
            return opener

        self.close_tag = u"</li></ul>"
        return u"<ul><li>"

    def render_close(self, parser, node_index):
        """Returns the closing list html that matches render_open."""
        # Only a list that incremented the counter may decrement it;
        # otherwise a suppressed nested list would drive the counter negative
        # and every later list in the post would be treated as nested.
        if self.close_tag:
            parser.tag_data["ListTag.count"] -= 1
        return self.close_tag
class ListItemTag(TagBase):
    """Renders the [*] tag, which separates the items of a list."""

    def __init__(self, name, **kwargs):
        """
        name -- The name of the bbcode tag
        """
        TagBase.__init__(self, name)
        self.closed = False

    def render_open(self, parser, node_index):
        """Returns the item separator.

        Outside a list an empty string is returned. The first item of a list
        returns None, as the list tag has already opened it.
        """
        tag_data = parser.tag_data
        inside_list = tag_data.setdefault("ListTag.count", 0)
        if inside_list:
            if not tag_data["ListItemTag.initial_item"]:
                return u"</li><li>"
            tag_data["ListItemTag.initial_item"] = False
            return None
        return u""
class SizeTag(TagBase):
    """Renders a [size] tag as a span with a font size in pixels."""

    valid_chars = frozenset("0123456789")

    def __init__(self, name, **kwargs):
        """
        name -- The name of the bbcode tag
        """
        TagBase.__init__(self, name, inline=True)

    def render_open(self, parser, node_index):
        """Returns the opening span, or an empty string if no size was given."""
        digits = "".join([ch for ch in self.params if ch in self.valid_chars])
        try:
            self.size = int(digits)
        except ValueError:
            # The parameter held no digits at all.
            self.size = None
            return u""

        self.size = self.validate_size(self.size)
        return u'<span style="font-size:%spx">' % self.size

    def render_close(self, parser, node_index):
        """Returns the closing span if one was opened."""
        if self.size is not None:
            return u'</span>'
        return u""

    def validate_size(self, size):
        """Returns size limited to the range 4 to 64."""
        return max(4, min(64, size))
class ColorTag(TagBase):
    """Renders a [color] tag as a span with a css color.

    The color is the first word of the tag parameter, lower-cased and with
    every character outside valid_chars removed.
    """

    valid_chars = frozenset("#0123456789abcdefghijklmnopqrstuvwxyz")

    def __init__(self, name, **kwargs):
        """
        name -- The name of the bbcode tag
        """
        TagBase.__init__(self, name, inline=True)

    def render_open(self, parser, node_index):
        """Returns the opening span, or an empty string if no color was given."""
        # Set up front so render_close can rely on the attribute existing.
        self.color = u""

        # A tag without a parameter (e.g. "[color]") has nothing to split,
        # so guard against indexing in to an empty list.
        words = (self.params or u"").split()
        if not words:
            return u""

        valid_chars = self.valid_chars
        color = words[0].lower()
        self.color = "".join([c for c in color if c in valid_chars])

        if not self.color:
            return u""

        return u'<span style="color:%s">' % self.color

    def render_close(self, parser, node_index):
        """Returns the closing span if one was opened."""
        if not self.color:
            return u''
        return u'</span>'
class CenterTag(TagBase):
    """Renders a [center] tag as a div with centered text."""

    _open_html = u'<div style="text-align:center;">'
    _close_html = u'</div>'

    def render_open(self, parser, node_index, **kwargs):
        """Returns the opening div."""
        return self._open_html

    def render_close(self, parser, node_index):
        """Returns the closing div."""
        return self._close_html
class ParagraphTag(TagBase):
    """Renders a [p] tag as an html paragraph.

    The count of open paragraphs is kept in the parser's tag data, so opening
    a paragraph while one is open closes the earlier one first.
    """

    def __init__(self, name, **kwargs):
        """
        name -- The name of the bbcode tag
        """
        TagBase.__init__(self, name)

    def render_open(self, parser, node_index, **kwargs):
        """Returns the opening <p>, preceded by </p> if a paragraph is open."""
        tag_data = parser.tag_data
        if tag_data.setdefault('ParagraphTag.level', 0) > 0:
            tag_data['ParagraphTag.level'] -= 1
            html = u'</p><p>'
        else:
            html = u'<p>'
        tag_data['ParagraphTag.level'] += 1
        return html

    def render_close(self, parser, node_index):
        """Returns the closing </p> if a paragraph is open, else an empty string."""
        tag_data = parser.tag_data
        if tag_data.setdefault('ParagraphTag.level', 0):
            tag_data['ParagraphTag.level'] -= 1
            return u'</p>'
        return u''
class SectionTag(TagBase):
    """A specialised tag that renders nothing, but stores its raw contents in
    the parser's tag data. Can be used to define extra content areas.

    Contents are appended to a list kept under tag_data['sections'], keyed by
    the section name taken from the tag parameter.
    """

    def __init__(self, name, **kwargs):
        """
        name -- The name of the bbcode tag
        """
        TagBase.__init__(self, name, enclosed=True)

    def render_open(self, parser, node_index):
        """Stores the contents of the tag and returns an empty string."""
        normalised = self.params.strip().lower()
        self.section_name = normalised.replace(u' ', u'_')

        contents = self.get_contents(parser)
        self.skip_contents(parser)

        sections = parser.tag_data.setdefault('sections', {})
        section_parts = sections.setdefault(self.section_name, [])
        section_parts.append(contents)

        return u''
# http://effbot.org/zone/python-replace.htm
class MultiReplace(object):
    """Replaces several substrings in a single pass over a string.

    Instances are callable: calling one with a string returns the string with
    every key of the replacement dictionary replaced by its value.
    """

    def __init__(self, repl_dict):
        """
        repl_dict -- Dictionary mapping each string to find to its replacement
        """
        # string to string mapping; use a regular expression.
        # Reverse lexical order puts a longer key ahead of any key that is a
        # prefix of it, so the longer alternative is tried first. sorted()
        # is used so that this also works where keys() is not a list.
        keys = sorted(repl_dict, reverse=True)
        pattern = u"|".join([re.escape(key) for key in keys])
        self.pattern = re.compile(pattern)
        self.dict = repl_dict

    def replace(self, s):
        """Returns s with the replacement dictionary applied."""
        def repl(match, get=self.dict.get):
            item = match.group(0)
            return get(item, item)
        return self.pattern.sub(repl, s)

    __call__ = replace
def _escape(s):
    """Html-escapes s after removing trailing newlines; other newlines become <br/>."""
    trimmed = s.rstrip('\n')
    return PostMarkup.standard_replace(trimmed)
def _escape_no_breaks(s):
    """Html-escapes s after removing trailing newlines; other newlines are kept."""
    trimmed = s.rstrip('\n')
    return PostMarkup.standard_replace_no_break(trimmed)
def _unescape(s):
    """Turns the &lt;, &gt; and &amp; entities in s back in to characters."""
    unescaped = PostMarkup.standard_unreplace(s)
    return unescaped
class TagFactory(object):
    """A registry of tags that builds a fresh tag object on every lookup."""

    def __init__(self):
        # Maps a tag name to a callable that returns a new tag instance.
        self.tags = {}

    @classmethod
    def tag_factory_callable(cls, tag_class, name, *args, **kwargs):
        """
        Returns a callable that returns a new tag instance.
        """
        return lambda: tag_class(name, *args, **kwargs)

    def add_tag(self, cls, name, *args, **kwargs):
        """Registers tag class cls under name; extra arguments go to its constructor."""
        maker = self.tag_factory_callable(cls, name, *args, **kwargs)
        self.tags[name] = maker

    def __getitem__(self, name):
        maker = self.tags[name]
        return maker()

    def __contains__(self, name):
        return name in self.tags

    def get(self, name, default=None):
        """Returns a new tag instance for name, or default if it is not registered."""
        try:
            maker = self.tags[name]
        except KeyError:
            return default
        return maker()
class _Parser(object):
    """ The interface to the parser that is handed to Tag classes. """

    def __init__(self, post_markup, tag_data=None):
        """
        post_markup -- The PostMarkup object doing the rendering
        tag_data -- Dictionary for tags to store data in; a new one is
                    created if None is given
        """
        self.pm = post_markup
        self.tag_data = {} if tag_data is None else tag_data
        self.render_node_index = 0

    def skip_to_node(self, node_index):
        """ Moves rendering to the given node, ignoring the nodes in between. """
        assert node_index is not None, "Node index must be non-None"
        self.render_node_index = node_index

    def get_text_nodes(self, node1, node2):
        """ Returns the text (non-callable) nodes between two node indices.

        If node2 is None only the node at node1 is considered.
        """
        if node2 is None:
            node2 = node1+1
        text_nodes = []
        for candidate in self.nodes[node1:node2]:
            if callable(candidate):
                continue
            text_nodes.append(candidate)
        return text_nodes

    def begin_no_breaks(self):
        """Disables replacing of newlines with break tags at the start and end of text nodes.
        Can only be called from a tag's 'open' method.
        """
        assert self.phase==1, "Can not be called from render_open or render_close"
        self.no_breaks_count = self.no_breaks_count + 1

    def end_no_breaks(self):
        """Re-enables auto-replacing of newlines with break tags (see begin_no_breaks)."""
        assert self.phase==1, "Can not be called from render_open or render_close"
        if not self.no_breaks_count:
            return
        self.no_breaks_count = self.no_breaks_count - 1
class PostMarkup(object):
    """Converts post markup (bbcode) to XHTML, using the tags registered with
    its tag factory. Instances are callable; calling one is a shortcut for
    render_to_html.
    """

    # Escapes html-significant characters and turns newlines in to breaks.
    standard_replace = MultiReplace({u'<': u'&lt;',
                                     u'>': u'&gt;',
                                     u'&': u'&amp;',
                                     u'\n': u'<br/>'})

    # Reverses the entity escaping done by standard_replace.
    standard_unreplace = MultiReplace({u'&lt;': u'<',
                                       u'&gt;': u'>',
                                       u'&amp;': u'&'})

    # As standard_replace, but leaves newlines alone.
    standard_replace_no_break = MultiReplace({u'<': u'&lt;',
                                              u'>': u'&gt;',
                                              u'&': u'&amp;'})

    # Token types produced by tokenize: a plain tag, a tag with a quoted
    # parameter, and text.
    TOKEN_TAG, TOKEN_PTAG, TOKEN_TEXT = range(3)

    _re_end_eq = re.compile(u"\\]|\\=", re.UNICODE)
    _re_quote_end = re.compile(u'"|\\]', re.UNICODE)

    # I tried to use RE's. Really I did.
    @classmethod
    def tokenize(cls, post):
        """Splits a bbcode string in to tag and text tokens.

        post -- BBCode string

        Yields (token_type, token, start_pos, end_pos) tuples, where
        token_type is TOKEN_TAG, TOKEN_PTAG or TOKEN_TEXT and
        post[start_pos:end_pos] is the token. Markup that starts a tag which
        is never terminated is yielded as text.
        """
        re_end_eq = cls._re_end_eq
        TOKEN_TAG = cls.TOKEN_TAG
        TOKEN_PTAG = cls.TOKEN_PTAG
        TOKEN_TEXT = cls.TOKEN_TEXT

        post_len = len(post)
        post_find = post.find
        pos = 0

        def find_first(post, pos, re_ff):
            match = re_ff.search(post, pos)
            if match is None:
                return -1
            return match.start()

        while True:
            brace_pos = post_find(u'[', pos)
            if brace_pos == -1:
                if pos < post_len:
                    yield TOKEN_TEXT, post[pos:], pos, post_len
                return
            if brace_pos - pos > 0:
                yield TOKEN_TEXT, post[pos:brace_pos], pos, brace_pos

            pos = brace_pos
            end_pos = pos + 1

            open_tag_pos = post_find(u'[', end_pos)
            end_pos = find_first(post, end_pos, re_end_eq)
            if end_pos == -1:
                yield TOKEN_TEXT, post[pos:], pos, post_len
                return

            # Another "[" comes before this one is closed, so this one is text.
            if open_tag_pos != -1 and open_tag_pos < end_pos:
                yield TOKEN_TEXT, post[pos:open_tag_pos], pos, open_tag_pos
                pos = open_tag_pos
                continue

            if post[end_pos] == u']':
                yield TOKEN_TAG, post[pos:end_pos+1], pos, end_pos+1
                pos = end_pos + 1
                continue

            # post[end_pos] is "=", so the tag carries a parameter.
            end_pos += 1
            while end_pos < post_len and post[end_pos] == u' ':
                end_pos += 1
            if end_pos >= post_len:
                # Unterminated tag: keep the remainder as text, not drop it.
                yield TOKEN_TEXT, post[pos:], pos, post_len
                return

            if post[end_pos] != u'"':
                # Search from end_pos itself so that an empty parameter
                # ("[tag=]") ends at its own bracket.
                end_pos = post_find(u']', end_pos)
                if end_pos == -1:
                    yield TOKEN_TEXT, post[pos:], pos, post_len
                    return
                yield TOKEN_TAG, post[pos:end_pos+1], pos, end_pos+1
            else:
                # Quoted parameter: find the closing quote, then the bracket.
                end_pos = post_find(u'"', end_pos+1)
                if end_pos == -1:
                    yield TOKEN_TEXT, post[pos:], pos, post_len
                    return
                end_pos = post_find(u']', end_pos+1)
                if end_pos == -1:
                    yield TOKEN_TEXT, post[pos:], pos, post_len
                    return
                yield TOKEN_PTAG, post[pos:end_pos+1], pos, end_pos+1

            pos = end_pos + 1

    @classmethod
    def _split_tag_token(cls, tag_type, tag_token):
        """Splits a tag token in to (tag_name, tag_attribs, end_tag).

        tag_name is lower-cased and has no leading slash, tag_attribs is the
        parameter string (without its quotes for a TOKEN_PTAG token) and
        end_tag is True for a closing tag.
        """
        tag_token = tag_token[1:-1].lstrip()
        if tag_type == cls.TOKEN_TAG:
            if u' ' in tag_token:
                tag_name, tag_attribs = tag_token.split(u' ', 1)
            elif u'=' in tag_token:
                tag_name, tag_attribs = tag_token.split(u'=', 1)
            else:
                tag_name, tag_attribs = tag_token, u""
            tag_attribs = tag_attribs.strip()
        else:
            tag_name, tag_attribs = tag_token.split(u'=', 1)
            tag_attribs = tag_attribs.strip()[1:-1]

        tag_name = tag_name.strip().lower()
        end_tag = tag_name.startswith(u'/')
        if end_tag:
            tag_name = tag_name[1:]
        return tag_name, tag_attribs, end_tag

    def add_tag(self, cls, name, *args, **kwargs):
        """Registers a tag class with the tag factory (see TagFactory.add_tag)."""
        return self.tag_factory.add_tag(cls, name, *args, **kwargs)

    def tagify_urls(self, postmarkup):
        """ Surrounds urls with url bbcode tags. Only text tokens are altered. """
        def repl(match):
            return u'[url]%s[/url]' % match.group(0)

        text_tokens = []
        TOKEN_TEXT = PostMarkup.TOKEN_TEXT
        for tag_type, tag_token, start_pos, end_pos in self.tokenize(postmarkup):
            if tag_type == TOKEN_TEXT:
                text_tokens.append(_re_url.sub(repl, tag_token))
            else:
                text_tokens.append(tag_token)

        return u"".join(text_tokens)

    def __init__(self, tag_factory=None):
        """
        tag_factory -- The TagFactory to use; a new empty one is created if omitted
        """
        self.tag_factory = tag_factory or TagFactory()

    def default_tags(self):
        """ Add some basic tags. """
        add_tag = self.tag_factory.add_tag

        add_tag(SimpleTag, u'b', u'strong')
        add_tag(SimpleTag, u'i', u'em')
        add_tag(SimpleTag, u'u', u'u')
        add_tag(SimpleTag, u's', u's')

    def get_supported_tags(self):
        """ Returns a sorted list of the supported tag names. """
        return sorted(self.tag_factory.tags.keys())

    def insert_paragraphs(self, post_markup):
        """Inserts paragraph tags in place of newlines. A more complex task than
        it may seem -- Multiple newlines result in just one paragraph tag, and
        paragraph tags aren't inserted inside certain other tags (such as the
        code tag). Returns a postmarkup string.

        post_markup -- A string containing the raw postmarkup
        """
        parts = [u'[p]']
        tag_factory = self.tag_factory
        enclosed_count = 0

        TOKEN_TEXT = PostMarkup.TOKEN_TEXT

        for tag_type, tag_token, start_pos, end_pos in self.tokenize(post_markup):
            if tag_type == TOKEN_TEXT:
                txt = post_markup[start_pos:end_pos]
                if not enclosed_count:
                    txt = _re_break_groups.sub(u'[p]', txt)
                parts.append(txt)
                continue

            tag_name, _tag_attribs, end_tag = self._split_tag_token(tag_type, tag_token)

            # Track whether we are inside a tag whose contents are not
            # bbcode processed.
            tag = tag_factory.get(tag_name, None)
            if tag is not None and tag.enclosed:
                if end_tag:
                    enclosed_count -= 1
                else:
                    enclosed_count += 1

            parts.append(post_markup[start_pos:end_pos])

        new_markup = u"".join(parts)
        return new_markup

    # Matches simple blank tags containing only whitespace
    _re_blank_tags = re.compile(r"\<(\w+?)\>\s*\</\1\>")

    @classmethod
    def cleanup_html(cls, html):
        """Cleans up html. Currently only removes blank tags, i.e. tags containing only
        whitespace. Only applies to tags without attributes. Tag removal is done
        recursively until there are no more blank tags. So <strong><em></em></strong>
        would be completely removed.

        html -- A string containing (X)HTML
        """
        original_html = ''
        while original_html != html:
            original_html = html
            html = cls._re_blank_tags.sub(u"", html)
        return html

    def render_to_html(self,
                       post_markup,
                       encoding="ascii",
                       exclude_tags=None,
                       auto_urls=True,
                       paragraphs=False,
                       clean=True,
                       tag_data=None):
        """Converts post markup (ie. bbcode) to XHTML. This method is threadsafe,
        by virtue that the state is entirely stored on the stack.

        post_markup -- String containing bbcode.
        encoding -- Encoding of string, defaults to "ascii" if the string is not
        already unicode.
        exclude_tags -- A collection of tag names to ignore.
        auto_urls -- If True, then urls will be wrapped with url bbcode tags.
        paragraphs -- If True then line breaks will be replaced with paragraph
        tags, rather than break tags.
        clean -- If True, html will be run through the cleanup_html method.
        tag_data -- An optional dictionary to store tag data in. The default of
        None will create a dictionary internally. Set this to your own dictionary
        if you want to retrieve information from the Tag Classes.
        """
        if not isinstance(post_markup, unicode):
            post_markup = unicode(post_markup, encoding, 'replace')

        if auto_urls:
            post_markup = self.tagify_urls(post_markup)

        if paragraphs:
            post_markup = self.insert_paragraphs(post_markup)

        parser = _Parser(self, tag_data=tag_data)
        parser.markup = post_markup

        if exclude_tags is None:
            exclude_tags = []

        tag_factory = self.tag_factory

        nodes = []
        parser.nodes = nodes

        parser.phase = 1
        parser.no_breaks_count = 0
        enclosed_count = 0
        # Tags that are currently open, innermost last.
        tag_stack = []
        # Tags that were closed early and are to be re-opened later.
        break_stack = []
        remove_next_newline = False

        def check_tag_stack(tag_name):
            for tag in reversed(tag_stack):
                if tag_name == tag.name:
                    return True
            return False

        def redo_break_stack():
            # Re-open every tag that was closed early.
            while break_stack:
                tag = break_stack.pop()
                open_tag(tag)
                tag_stack.append(tag)

        def break_inline_tags():
            # Close the inline tags at the top of the stack, remembering them
            # so that they can be re-opened.
            while tag_stack:
                if tag_stack[-1].inline:
                    tag = tag_stack.pop()
                    close_tag(tag)
                    break_stack.append(tag)
                else:
                    break

        def open_tag(tag):
            def call(node_index):
                return tag.render_open(parser, node_index)
            nodes.append(call)

        def close_tag(tag):
            def call(node_index):
                return tag.render_close(parser, node_index)
            nodes.append(call)

        TOKEN_TEXT = PostMarkup.TOKEN_TEXT

        # Pass 1 -- build a list of nodes; a node is either escaped text or a
        # callable that renders one side of a tag.
        for tag_type, tag_token, start_pos, end_pos in self.tokenize(post_markup):

            if tag_type == TOKEN_TEXT:
                if parser.no_breaks_count:
                    tag_token = tag_token.strip()
                    if not tag_token:
                        continue
                if remove_next_newline:
                    tag_token = tag_token.lstrip(' ')
                    if tag_token.startswith('\n'):
                        tag_token = tag_token.lstrip(' ')[1:]
                        if not tag_token:
                            continue
                    remove_next_newline = False

                if tag_stack and tag_stack[-1].strip_first_newline:
                    tag_token = tag_token.lstrip()
                    tag_stack[-1].strip_first_newline = False
                    # NOTE(review): no tag class in this file defines a truth
                    # value, so this branch looks unreachable -- confirm.
                    if not tag_stack[-1]:
                        tag_stack.pop()
                        continue

                if not enclosed_count:
                    redo_break_stack()

                nodes.append(self.standard_replace(tag_token))
                continue

            tag_name, tag_attribs, end_tag = self._split_tag_token(tag_type, tag_token)

            # Inside an enclosed tag only that tag's own name is recognised.
            if enclosed_count and tag_stack[-1].name != tag_name:
                continue

            if tag_name in exclude_tags:
                continue

            if not end_tag:
                tag = tag_factory.get(tag_name, None)
                if tag is None:
                    continue

                redo_break_stack()

                if not tag.inline:
                    break_inline_tags()

                tag.open(parser, tag_attribs, end_pos, len(nodes))
                if tag.enclosed:
                    enclosed_count += 1
                tag_stack.append(tag)

                open_tag(tag)

                if tag.auto_close:
                    tag = tag_stack.pop()
                    # Pass the parser, as every other close() call does.
                    tag.close(parser, start_pos, len(nodes)-1)
                    close_tag(tag)

            else:
                if break_stack and break_stack[-1].name == tag_name:
                    # Close the tag that was actually popped, rather than
                    # whichever tag was handled last.
                    tag = break_stack.pop()
                    tag.close(parser, start_pos, len(nodes))
                elif check_tag_stack(tag_name):
                    # Close any tags opened inside this one; they are
                    # re-opened afterwards.
                    while tag_stack[-1].name != tag_name:
                        tag = tag_stack.pop()
                        break_stack.append(tag)
                        close_tag(tag)

                    tag = tag_stack.pop()
                    tag.close(parser, start_pos, len(nodes))
                    if tag.enclosed:
                        enclosed_count -= 1

                    close_tag(tag)

                    if not tag.inline:
                        remove_next_newline = True

        # Close whatever is still open at the end of the markup.
        if tag_stack:
            redo_break_stack()
            while tag_stack:
                tag = tag_stack.pop()
                tag.close(parser, len(post_markup), len(nodes))
                if tag.enclosed:
                    enclosed_count -= 1
                close_tag(tag)

        parser.phase = 2

        # Pass 2 -- render the nodes. A tag may move render_node_index
        # forward (see _Parser.skip_to_node) to skip its own contents.
        parser.nodes = nodes

        text = []
        parser.render_node_index = 0
        while parser.render_node_index < len(parser.nodes):
            i = parser.render_node_index
            node_text = parser.nodes[i]
            if callable(node_text):
                node_text = node_text(i)
            if node_text is not None:
                text.append(node_text)
            parser.render_node_index += 1

        html = u"".join(text)
        if clean:
            html = self.cleanup_html(html)
        return html

    # A shortcut for render_to_html
    __call__ = render_to_html
# Module-level renderer used by render_bbcode; it uses Pygments for the code
# tag only if the import at the top of the file succeeded.
_postmarkup = create(use_pygments=pygments_available)
def render_bbcode(bbcode,
                  encoding="ascii",
                  exclude_tags=None,
                  auto_urls=True,
                  paragraphs=False,
                  clean=True,
                  tag_data=None):
    """ Renders a bbcode string in to XHTML with the module's own renderer.
    This is a shortcut if you don't need to customize any tags.

    bbcode -- String containing bbcode.
    encoding -- Encoding of string, defaults to "ascii" if the string is not
    already unicode.
    exclude_tags -- A collection of tag names to ignore.
    auto_urls -- If True, then urls will be wrapped with url bbcode tags.
    paragraphs -- If True then line breaks will be replaced with paragraph
    tags, rather than break tags.
    clean -- If True, html will be run through a cleanup_html method.
    tag_data -- An optional dictionary to store tag data in. The default of
    None will create a dictionary internally.
    """
    options = dict(exclude_tags=exclude_tags,
                   auto_urls=auto_urls,
                   paragraphs=paragraphs,
                   clean=clean,
                   tag_data=tag_data)
    return _postmarkup(bbcode, encoding, **options)
def _tests():
    """Renders a series of sample posts and prints the results as html."""
    import sys

    post_markup = create(use_pygments=True)

    print("""<link rel="stylesheet" href="code.css" type="text/css" />\n""")

    tests = [
        ']',
        '[',
        ':-[ Hello, [b]World[/b]',

        "[link=http://www.willmcgugan.com]My homepage[/link]",
        '[link="http://www.willmcgugan.com"]My homepage[/link]',
        "[link http://www.willmcgugan.com]My homepage[/link]",
        "[link]http://www.willmcgugan.com[/link]",

        u"[b]Hello André[/b]",
        u"[google]André[/google]",
        "[s]Strike through[/s]",
        "[b]bold [i]bold and italic[/b] italic[/i]",
        "[google]Will McGugan[/google]",
        "[wiki Will McGugan]Look up my name in Wikipedia[/wiki]",

        "[quote Will said...]BBCode is very cool[/quote]",

        """[code python]
# A proxy object that calls a callback when converted to a string
class TagStringify(object):
def __init__(self, callback, raw):
self.callback = callback
self.raw = raw
r[b]=3
def __str__(self):
return self.callback()
def __repr__(self):
return self.__str__()
[/code]""",

        u"[img]http://upload.wikimedia.org/wikipedia/commons"\
        "/6/61/Triops_longicaudatus.jpg[/img]",

        "[list][*]Apples[*]Oranges[*]Pears[/list]",
        """[list=1]
[*]Apples
[*]Oranges
are not the only fruit
[*]Pears
[/list]""",
        "[list=a][*]Apples[*]Oranges[*]Pears[/list]",
        "[list=A][*]Apples[*]Oranges[*]Pears[/list]",

        """[b]Long test[/b]
New lines characters are converted to breaks."""\
"""Tags my be [b]ove[i]rl[/b]apped[/i].
[i]Open tags will be closed.
[b]Test[/b]""",

        "[dict]Will[/dict]",

        "[code unknownlanguage]10 print 'In yr code'; 20 goto 10[/code]",

        "[url=http://www.google.com/coop/cse?cx=006850030468302103399%3Amqxv78bdfdo]CakePHP Google Groups[/url]",
        "[url=http://www.google.com/search?hl=en&safe=off&client=opera&rls=en&hs=pO1&q=python+bbcode&btnG=Search]Search for Python BBCode[/url]",

        # Attempt to inject html in to unicode
        "[url=http://www.test.com/sfsdfsdf/ter?t=\"></a><h1>HACK</h1><a>\"]Test Hack[/url]",

        'Nested urls, i.e. [url][url]www.becontrary.com[/url][/url], are condensed in to a single tag.',

        u'[google]ɸβfvθðsz[/google]',

        u'[size 30]Hello, World![/size]',

        u'[color red]This should be red[/color]',
        u'[color #0f0]This should be green[/color]',
        u"[center]This should be in the center!",

        'Nested urls, i.e. [url][url]www.becontrary.com[/url][/url], are condensed in to a single tag.',

        '[b]Hello, [i]World[/b]! [/i]',

        '[b][center]This should be centered![/center][/b]',

        '[list][*]Hello[i][*]World![/i][/list]',

        """[list=1]
[*]Apples
[*]Oranges
are not the only fruit
[*]Pears
[/list]""",

        "[b]urls such as http://www.willmcgugan.com are authomaticaly converted to links[/b]",

        """
[b]
[code python]
parser.markup[self.open_pos:self.close_pos]
[/code]
asdasdasdasdqweqwe
""",

        """[list 1]
[*]Hello
[*]World
[/list]""",

        "[b][p]Hello, [p]World",
        "[p][p][p]",

        "http://www.google.com/search?as_q=bbcode&btnG=%D0%9F%D0%BE%D0%B8%D1%81%D0%BA",
    ]

    # Show each sample's markup followed by its rendered html.
    for sample in tests:
        print(u"<pre>%s</pre>" % str(sample.encode("ascii", "xmlcharrefreplace")))
        print(u"<p>%s</p>" % str(post_markup(sample).encode("ascii", "xmlcharrefreplace")))
        print(u"<hr/>")
        print("")

    # Render a post with section tags, with and without html cleanup.
    smarkup = create()
    smarkup.add_tag(SectionTag, 'section')

    test = """Hello, World.[b][i]This in italics
[section sidebar]This is the [b]sidebar[/b][/section]
[section footer]
This is the footer
[/section]
More text"""

    print(smarkup(test, paragraphs=True, clean=False))
    tag_data = {}
    print(smarkup(test, tag_data=tag_data, paragraphs=True, clean=True))
    print(tag_data)
def _run_unittests():
    """Runs the module's unit tests with a verbose text runner."""
    # TODO: Expand tests for better coverage!
    import unittest

    class TestPostmarkup(unittest.TestCase):

        def testcleanuphtml(self):
            postmarkup = create()

            cases = [("""\n<p>\n </p>\n""", ""),
                     ("""<b>\n\n<i> </i>\n</b>Test""", "Test"),
                     ("""<p id="test">Test</p>""", """<p id="test">Test</p>"""),]

            for html, expected in cases:
                cleaned = PostMarkup.cleanup_html(html).strip()
                self.assertEqual(cleaned, expected)

        def testsimpletag(self):
            postmarkup = create()

            cases = [('[b]Hello[/b]', "<strong>Hello</strong>"),
                     ('[i]Italic[/i]', "<em>Italic</em>"),
                     ('[s]Strike[/s]', "<strike>Strike</strike>"),
                     ('[u]underlined[/u]', "<u>underlined</u>"),
                     ]

            for bbcode, expected in cases:
                self.assertEqual(postmarkup(bbcode), expected)

        def testoverlap(self):
            postmarkup = create()

            cases = [('[i][b]Hello[/i][/b]', "<em><strong>Hello</strong></em>"),
                     ('[b]bold [u]both[/b] underline[/u]', '<strong>bold <u>both</u></strong><u> underline</u>')
                     ]

            for bbcode, expected in cases:
                self.assertEqual(postmarkup(bbcode), expected)

        def testlinks(self):
            postmarkup = create(annotate_links=False)

            cases = [('[link=http://www.willmcgugan.com]blog1[/link]', '<a href="http://www.willmcgugan.com">blog1</a>'),
                     ('[link="http://www.willmcgugan.com"]blog2[/link]', '<a href="http://www.willmcgugan.com">blog2</a>'),
                     ('[link http://www.willmcgugan.com]blog3[/link]', '<a href="http://www.willmcgugan.com">blog3</a>'),
                     ('[link]http://www.willmcgugan.com[/link]', '<a href="http://www.willmcgugan.com">http://www.willmcgugan.com</a>')
                     ]

            for bbcode, expected in cases:
                self.assertEqual(postmarkup(bbcode), expected)

    loader = unittest.TestLoader()
    suite = loader.loadTestsFromTestCase(TestPostmarkup)
    runner = unittest.TextTestRunner(verbosity=2)
    runner.run(suite)
def _ff_test():
    """Times two ways of finding the first of two characters in a string.

    ff1 uses two str.find calls, ff2 uses a pre-compiled regular expression.
    The elapsed time of each, in seconds, is printed.
    """

    def ff1(post, pos, c1, c2):
        f1 = post.find(c1, pos)
        f2 = post.find(c2, pos)
        if f1 == -1:
            return f2
        if f2 == -1:
            return f1
        return min(f1, f2)

    re_ff = re.compile('a|b', re.UNICODE)

    def ff2(post, pos, c1, c2):
        # Search from pos and return the index of the match, so that ff2
        # does the same work as ff1. c1 and c2 are fixed by re_ff.
        match = re_ff.search(post, pos)
        if match is None:
            return -1
        return match.start()

    text = u"sdl;fk;sdlfks;dflksd;flksdfsdfwerwerwgwegwegwegwegwegegwweggewwegwegwegwettttttttttttttttttttttttttttttttttgggggggggg;slbdfkwelrkwelrkjal;sdfksdl;fksdf;lb"

    REPEAT = 100000

    from time import time

    start = time()
    for n in xrange(REPEAT):
        ff1(text, 0, "a", "b")
    end = time()
    print(end - start)

    start = time()
    for n in xrange(REPEAT):
        ff2(text, 0, "a", "b")
    end = time()
    print(end - start)
# When run as a script, print the demo output and then run the unit tests.
if __name__ == "__main__":
    _tests()
    _run_unittests()