From ffb8cb8caca2f0b73c480b96a2b753ef5866a91e Mon Sep 17 00:00:00 2001 From: Sam Date: Thu, 24 Sep 2015 13:37:53 +1000 Subject: [PATCH] FEATURE: remove dependency of Redcarpet PERF: cache fancy_title in topics table New pure ruby implementation is far more flexible and easier to amend. --- Gemfile | 1 - Gemfile.lock | 2 - app/models/topic.rb | 30 +- ...20150924022040_add_fancy_title_to_topic.rb | 5 + lib/html_prettify.rb | 407 ++++++++++++++++++ spec/components/html_prettify_spec.rb | 30 ++ spec/models/topic_spec.rb | 21 +- 7 files changed, 483 insertions(+), 13 deletions(-) create mode 100644 db/migrate/20150924022040_add_fancy_title_to_topic.rb create mode 100644 lib/html_prettify.rb create mode 100644 spec/components/html_prettify_spec.rb diff --git a/Gemfile b/Gemfile index 1476657d6..be623dde7 100644 --- a/Gemfile +++ b/Gemfile @@ -37,7 +37,6 @@ gem 'babel-transpiler' gem 'message_bus' gem 'rails_multisite', path: 'vendor/gems/rails_multisite' -gem 'redcarpet', require: false gem 'fast_xs' gem 'fast_xor' diff --git a/Gemfile.lock b/Gemfile.lock index dad42251f..d96defd9c 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -320,7 +320,6 @@ GEM ffi (>= 1.0.6) msgpack (>= 0.4.3) trollop (>= 1.16.2) - redcarpet (3.3.2) redis (3.2.1) redis-namespace (1.5.2) redis (~> 3.0, >= 3.0.4) @@ -510,7 +509,6 @@ DEPENDENCIES rb-fsevent rb-inotify (~> 0.9) rbtrace - redcarpet redis rest-client rinku diff --git a/app/models/topic.rb b/app/models/topic.rb index 71111defc..12be5183e 100644 --- a/app/models/topic.rb +++ b/app/models/topic.rb @@ -5,6 +5,7 @@ require_dependency 'rate_limiter' require_dependency 'text_sentinel' require_dependency 'text_cleaner' require_dependency 'archetype' +require_dependency 'html_prettify' class Topic < ActiveRecord::Base include ActionView::Helpers::SanitizeHelper @@ -164,6 +165,9 @@ class Topic < ActiveRecord::Base cancel_auto_close_job ensure_topic_has_a_category end + if title_changed? + write_attribute :fancy_title, Topic.fancy_title(title) + end end after_save do @@ -267,17 +271,28 @@ class Topic < ActiveRecord::Base apply_per_day_rate_limit_for("pms", :max_private_messages_per_day) end + def self.fancy_title(title) + escaped = ERB::Util.html_escape(title) + return unless escaped + HtmlPrettify.render(escaped) + end + def fancy_title - sanitized_title = ERB::Util.html_escape(title) + return ERB::Util.html_escape(title) unless SiteSetting.title_fancy_entities? - return unless sanitized_title - return sanitized_title unless SiteSetting.title_fancy_entities? + unless fancy_title = read_attribute(:fancy_title) - # We don't always have to require this, if fancy is disabled - # see: http://meta.discourse.org/t/pattern-for-defer-loading-gems-and-profiling-with-perftools-rb/4629 - require 'redcarpet' unless defined? Redcarpet + fancy_title = Topic.fancy_title(title) + write_attribute(:fancy_title, fancy_title) - Redcarpet::Render::SmartyPants.render(sanitized_title) + unless new_record? + # make sure data is set in table, this also allows us to change algorithm + # by simply nulling this column + exec_sql("UPDATE topics SET fancy_title = :fancy_title where id = :id", id: self.id, fancy_title: fancy_title) + end + end + + fancy_title end def pending_posts_count @@ -698,6 +713,7 @@ class Topic < ActiveRecord::Base def title=(t) slug = Slug.for(t.to_s) write_attribute(:slug, slug) + write_attribute(:fancy_title, nil) write_attribute(:title,t) end diff --git a/db/migrate/20150924022040_add_fancy_title_to_topic.rb b/db/migrate/20150924022040_add_fancy_title_to_topic.rb new file mode 100644 index 000000000..49863f89f --- /dev/null +++ b/db/migrate/20150924022040_add_fancy_title_to_topic.rb @@ -0,0 +1,5 @@ +class AddFancyTitleToTopic < ActiveRecord::Migration + def change + add_column :topics, :fancy_title, :string, limit: 400, null: true + end +end diff --git a/lib/html_prettify.rb b/lib/html_prettify.rb new file mode 100644 index 000000000..6d0b3d065 --- /dev/null +++ b/lib/html_prettify.rb @@ -0,0 +1,407 @@ +# heavily based off +# https://github.com/vmg/redcarpet/blob/master/ext/redcarpet/html_smartypants.c +# and +# https://github.com/jmcnevin/rubypants/blob/master/lib/rubypants/core.rb +# 99% of the code here is by Jeremy McNevin +# +# This Source File is available under BSD/MIT license as well as standard GPL +# + +class HtmlPrettify < String + def self.render(html) + new(html).to_html + end + + # Create a new RubyPants instance with the text in +string+. + # + # Allowed elements in the options array: + # + # 0 :: do nothing + # 1 :: enable all, using only em-dash shortcuts + # 2 :: enable all, using old school en- and em-dash shortcuts (*default*) + # 3 :: enable all, using inverted old school en and em-dash shortcuts + # -1 :: stupefy (translate HTML entities to their ASCII-counterparts) + # + # If you don't like any of these defaults, you can pass symbols to change + # RubyPants' behavior: + # + # :quotes :: quotes + # :backticks :: backtick quotes (``double'' only) + # :allbackticks :: backtick quotes (``double'' and `single') + # :dashes :: dashes + # :oldschool :: old school dashes + # :inverted :: inverted old school dashes + # :ellipses :: ellipses + # :convertquotes :: convert " entities to + # " + # :stupefy :: translate RubyPants HTML entities + # to their ASCII counterparts. + # + # In addition, you can customize the HTML entities that will be injected by + # passing in a hash for the final argument. The defaults for these entities + # are as follows: + # + # :single_left_quote :: + # :double_left_quote :: + # :single_right_quote :: + # :double_right_quote :: + # :em_dash :: + # :en_dash :: + # :ellipsis :: + # :html_quote :: " + # + def initialize(string, options=[2], entities = {}) + super string + + @options = [*options] + @entities = default_entities.update(entities) + end + + # Apply SmartyPants transformations. + def to_html + do_quotes = do_backticks = do_dashes = do_ellipses = nil + + if @options.include?(0) + # Do nothing. + return self + elsif @options.include?(1) + # Do everything, turn all options on. + do_quotes = do_backticks = do_ellipses = true + do_dashes = :normal + elsif @options.include?(2) + # Do everything, turn all options on, use old school dash shorthand. + do_quotes = do_backticks = do_ellipses = true + do_dashes = :oldschool + elsif @options.include?(3) + # Do everything, turn all options on, use inverted old school + # dash shorthand. + do_quotes = do_backticks = do_ellipses = true + do_dashes = :inverted + elsif @options.include?(-1) + do_stupefy = true + else + do_quotes = @options.include?(:quotes) + do_backticks = @options.include?(:backticks) + do_backticks = :both if @options.include?(:allbackticks) + do_dashes = :normal if @options.include?(:dashes) + do_dashes = :oldschool if @options.include?(:oldschool) + do_dashes = :inverted if @options.include?(:inverted) + do_ellipses = @options.include?(:ellipses) + do_stupefy = @options.include?(:stupefy) + end + + # Parse the HTML + tokens = tokenize + + # Keep track of when we're inside
 or  tags.
+    in_pre = false
+
+    # Here is the result stored in.
+    result = ""
+
+    # This is a cheat, used to get some context for one-character
+    # tokens that consist of just a quote char. What we do is remember
+    # the last character of the previous text token, to use as context
+    # to curl single- character quote tokens correctly.
+    prev_token_last_char = nil
+
+    tokens.each do |token|
+      if token.first == :tag
+        result << token[1]
+        if token[1] =~ %r!<(/?)(?:pre|code|kbd|script|math)[\s>]!
+          in_pre = ($1 != "/")  # Opening or closing tag?
+        end
+      else
+        t = token[1]
+
+        # Remember last char of this token before processing.
+        last_char = t[-1].chr
+
+        unless in_pre
+
+          t.gsub!("'", "'")
+
+          t = process_escapes t
+
+          t.gsub!(""", '"')
+
+          if do_dashes
+            t = educate_dashes t            if do_dashes == :normal
+            t = educate_dashes_oldschool t  if do_dashes == :oldschool
+            t = educate_dashes_inverted t   if do_dashes == :inverted
+          end
+
+          t = educate_ellipses t  if do_ellipses
+
+          t = educate_fractions t
+
+          # Note: backticks need to be processed before quotes.
+          if do_backticks
+            t = educate_backticks t
+            t = educate_single_backticks t  if do_backticks == :both
+          end
+
+          if do_quotes
+            if t == "'"
+              # Special case: single-character ' token
+              if prev_token_last_char =~ /\S/
+                t = entity(:single_right_quote)
+              else
+                t = entity(:single_left_quote)
+              end
+            elsif t == '"'
+              # Special case: single-character " token
+              if prev_token_last_char =~ /\S/
+                t = entity(:double_right_quote)
+              else
+                t = entity(:double_left_quote)
+              end
+            else
+              # Normal case:
+              t = educate_quotes t
+            end
+          end
+
+          t = stupefy_entities t  if do_stupefy
+        end
+
+        prev_token_last_char = last_char
+        result << t
+      end
+    end
+
+    # Done
+    result
+  end
+
+  protected
+
+  # Return the string, with after processing the following backslash
+  # escape sequences. This is useful if you want to force a "dumb" quote
+  # or other character to appear.
+  #
+  # Escaped are:
+  #      \\    \"    \'    \.    \-    \`
+  #
+  def process_escapes(str)
+    str = str.gsub('\\\\', '\')
+    str.gsub!('\"',   '"')
+    str.gsub!("\\\'", ''')
+    str.gsub!('\.',   '.')
+    str.gsub!('\-',   '-')
+    str.gsub!('\`',   '`')
+    str
+  end
+
+  # The string, with each instance of "--" translated to an
+  # em-dash HTML entity.
+  #
+  def educate_dashes(str)
+    str.
+      gsub(/--/, entity(:em_dash))
+  end
+
+  # The string, with each instance of "--" translated to an
+  # en-dash HTML entity, and each "---" translated to an
+  # em-dash HTML entity.
+  #
+  def educate_dashes_oldschool(str)
+    str.
+      gsub(/---/, entity(:em_dash)).
+      gsub(/--/,  entity(:en_dash))
+  end
+
+  # Return the string, with each instance of "--" translated
+  # to an em-dash HTML entity, and each "---" translated to
+  # an en-dash HTML entity. Two reasons why: First, unlike the en- and
+  # em-dash syntax supported by +educate_dashes_oldschool+, it's
+  # compatible with existing entries written before SmartyPants 1.1,
+  # back when "--" was only used for em-dashes.  Second,
+  # em-dashes are more common than en-dashes, and so it sort of makes
+  # sense that the shortcut should be shorter to type. (Thanks to
+  # Aaron Swartz for the idea.)
+  #
+  def educate_dashes_inverted(str)
+    str.
+      gsub(/---/, entity(:en_dash)).
+      gsub(/--/,  entity(:em_dash))
+  end
+
+  # Return the string, with each instance of "..." translated
+  # to an ellipsis HTML entity. Also converts the case where there are
+  # spaces between the dots.
+  #
+  def educate_ellipses(str)
+    str.
+      gsub('...',   entity(:ellipsis)).
+      gsub('. . .', entity(:ellipsis))
+  end
+
+  # Return the string, with "``backticks''"-style single quotes
+  # translated into HTML curly quote entities.
+  #
+  def educate_backticks(str)
+    str.
+      gsub("``", entity(:double_left_quote)).
+      gsub("''", entity(:double_right_quote))
+  end
+
+  # Return the string, with "`backticks'"-style single quotes
+  # translated into HTML curly quote entities.
+  #
+  def educate_single_backticks(str)
+    str.
+      gsub("`", entity(:single_left_quote)).
+      gsub("'", entity(:single_right_quote))
+  end
+
+  def educate_fractions(str)
+    str.gsub(/(\s+|^)(1\/4|1\/2|3\/4)([,.;\s]|$)/) do
+      frac =
+        if $2 == "1/2".freeze
+          entity(:frac12)
+        elsif $2 == "1/4".freeze
+          entity(:frac14)
+        elsif $2 == "3/4".freeze
+          entity(:frac34)
+        end
+      "#{$1}#{frac}#{$3}"
+    end
+  end
+
+  # Return the string, with "educated" curly quote HTML entities.
+  #
+  def educate_quotes(str)
+    punct_class = '[!"#\$\%\'()*+,\-.\/:;<=>?\@\[\\\\\]\^_`{|}~]'
+
+    # normalize html
+    str = str.dup
+    # Special case if the very first character is a quote followed by
+    # punctuation at a non-word-break. Close the quotes by brute
+    # force:
+    str.gsub!(/^'(?=#{punct_class}\B)/,
+              entity(:single_right_quote))
+    str.gsub!(/^"(?=#{punct_class}\B)/,
+              entity(:double_right_quote))
+
+    # Special case for double sets of quotes, e.g.:
+    #   

He said, "'Quoted' words in a larger quote."

+ str.gsub!(/"'(?=\w)/, + "#{entity(:double_left_quote)}#{entity(:single_left_quote)}") + str.gsub!(/'"(?=\w)/, + "#{entity(:single_left_quote)}#{entity(:double_left_quote)}") + + # Special case for decade abbreviations (the '80s): + str.gsub!(/'(?=\d\ds)/, + entity(:single_right_quote)) + + close_class = %![^\ \t\r\n\\[\{\(\-]! + dec_dashes = "#{entity(:en_dash)}|#{entity(:em_dash)}" + + # Get most opening single quotes: + str.gsub!(/(\s| |=|--|&[mn]dash;|#{dec_dashes}|ȁ[34];)'(?=\w)/, + '\1' + entity(:single_left_quote)) + + # Single closing quotes: + str.gsub!(/(#{close_class})'/, + '\1' + entity(:single_right_quote)) + str.gsub!(/'(\s|s\b|$)/, + entity(:single_right_quote) + '\1') + + # Any remaining single quotes should be opening ones: + str.gsub!(/'/, + entity(:single_left_quote)) + + # Get most opening double quotes: + str.gsub!(/(\s| |=|--|&[mn]dash;|#{dec_dashes}|ȁ[34];)"(?=\w)/, + '\1' + entity(:double_left_quote)) + + # Double closing quotes: + str.gsub!(/(#{close_class})"/, + '\1' + entity(:double_right_quote)) + str.gsub!(/"(\s|s\b|$)/, + entity(:double_right_quote) + '\1') + + # Any remaining quotes should be opening ones: + str.gsub!(/"/, + entity(:double_left_quote)) + + str + end + + # Return the string, with each RubyPants HTML entity translated to + # its ASCII counterpart. + # + # Note: This is not reversible (but exactly the same as in SmartyPants) + # + def stupefy_entities(str) + new_str = str.dup + + { + :en_dash => '-', + :em_dash => '--', + :single_left_quote => "'", + :single_right_quote => "'", + :double_left_quote => '"', + :double_right_quote => '"', + :ellipsis => '...' + }.each do |k,v| + new_str.gsub!(/#{entity(k)}/, v) + end + + new_str + end + + # Return an array of the tokens comprising the string. Each token is + # either a tag (possibly with nested, tags contained therein, such + # as , or a run of text between + # tags. Each element of the array is a two-element array; the first + # is either :tag or :text; the second is the actual value. + # + # Based on the _tokenize() subroutine from Brad Choate's + # MTRegex plugin. + # + # This is actually the easier variant using tag_soup, as used by + # Chad Miller in the Python port of SmartyPants. + # + def tokenize + tag_soup = /([^<]*)(<[^>]*>)/ + + tokens = [] + + prev_end = 0 + + scan(tag_soup) do + tokens << [:text, $1] if $1 != "" + tokens << [:tag, $2] + prev_end = $~.end(0) + end + + if prev_end < size + tokens << [:text, self[prev_end..-1]] + end + + tokens + end + + def default_entities + { + single_left_quote: "‘", + double_left_quote: "“", + single_right_quote: "’", + double_right_quote: "”", + em_dash: "—", + en_dash: "–", + ellipsis: "…", + html_quote: """, + frac12: "½", + frac14: "¼", + frac34: "¾", + } + end + + def entity(key) + @entities[key] + end + +end diff --git a/spec/components/html_prettify_spec.rb b/spec/components/html_prettify_spec.rb new file mode 100644 index 000000000..8f1126eeb --- /dev/null +++ b/spec/components/html_prettify_spec.rb @@ -0,0 +1,30 @@ +require 'spec_helper' +require 'html_prettify' + +describe HtmlPrettify do + + def t(source, expected) + expect(HtmlPrettify.render(source)).to eq(expected) + end + + it 'correctly prettifies html' do + t "

All's well!

", "

All’s well!

" + t "

Eatin' Lunch'.

", "

Eatin’ Lunch’.

" + t "

a 1/4. is a fraction but not 1/4/2000

", "

a ¼. is a fraction but not 1/4/2000

" + t "

Well that'll be the day

", "

Well that’ll be the day

" + t %(

"Quoted text"

), %(

“Quoted text”

) + t "

I've been meaning to tell you ..

", "

I’ve been meaning to tell you ..

" + t "

single `backticks` in HTML should be preserved

", "

single `backticks` in HTML should be preserved

" + t "

double hyphen -- ndash --- mdash

", "

double hyphen – ndash — mdash

" + t "a long time ago...", "a long time ago…" + t "is 'this a mistake'?", "is ‘this a mistake’?" + t ERB::Util.html_escape("'that went well'"), "‘that went well’" + t '"that went well"', "“that went well”" + t ERB::Util.html_escape('"that went well"'), "“that went well”" + + t 'src="test.png"> yay', "src=“test.png”> yay" + + t ERB::Util.html_escape(' yay'), "<img src=“test.png”> yay" + end + +end diff --git a/spec/models/topic_spec.rb b/spec/models/topic_spec.rb index ccaed7b06..3ed9e895d 100644 --- a/spec/models/topic_spec.rb +++ b/spec/models/topic_spec.rb @@ -214,7 +214,7 @@ describe Topic do context 'title_fancy_entities disabled' do before do - SiteSetting.stubs(:title_fancy_entities).returns(false) + SiteSetting.title_fancy_entities = false end it "doesn't add entities to the title" do @@ -224,11 +224,26 @@ describe Topic do context 'title_fancy_entities enabled' do before do - SiteSetting.stubs(:title_fancy_entities).returns(true) + SiteSetting.title_fancy_entities = true end - it "converts the title to have fancy entities" do + it "converts the title to have fancy entities and updates" do expect(topic.fancy_title).to eq("“this topic” – has “fancy stuff”") + topic.title = "this is my test hello world... yay" + topic.user.save! + topic.save! + topic.reload + expect(topic.fancy_title).to eq("This is my test hello world… yay") + + topic.title = "I made a change to the title" + topic.save! + + topic.reload + expect(topic.fancy_title).to eq("I made a change to the title") + + # another edge case + topic.title = "this is another edge case" + expect(topic.fancy_title).to eq("this is another edge case") end end end