FIX: properly unescape HTML entities in excerpts

2025-04-07 04:24:35 -04:00 · 2014-12-10 12:52:51 +01:00 · 2014-12-10 12:52:51 +01:00 · 6027073547
commit 6027073547
parent 7e609e1834
2 changed files with 8 additions and 6 deletions
--- a/lib/excerpt_parser.rb
+++ b/lib/excerpt_parser.rb
@@ -17,16 +17,15 @@ class ExcerptParser < Nokogiri::XML::SAX::Document

  def self.get_excerpt(html, length, options)
    html ||= ''
-    if (html.include? 'excerpt') && (SPAN_REGEX === html)
-      length = html.length
-    end
+    length = html.length if html.include?('excerpt') && SPAN_REGEX === html
    me = self.new(length, options)
    parser = Nokogiri::HTML::SAX::Parser.new(me)
    catch(:done) do
      parser.parse(html)
    end
-    me.excerpt.strip!
-    me.excerpt
+    excerpt = me.excerpt.strip
+    excerpt = CGI.unescapeHTML(excerpt) if options[:text_entities] == true
+    excerpt
  end

  def escape_attribute(v)
--- a/spec/components/pretty_text_spec.rb
+++ b/spec/components/pretty_text_spec.rb
@@ -84,7 +84,6 @@ describe PrettyText do
  describe "Excerpt" do

    it "sanitizes attempts to inject invalid attributes" do
-
      spinner = "<a href=\"http://thedailywtf.com/\" data-bbcode=\"' class='fa fa-spin\">WTF</a>"
      PrettyText.excerpt(spinner, 20).should match_html spinner

@@ -216,6 +215,10 @@ describe PrettyText do
      post.excerpt.should == two_hundred
    end

+    it "unescapes html entities when we want text entities" do
+      PrettyText.excerpt("&#39;", 500, text_entities: true).should == "'"
+    end
+
  end

  describe "strip links" do