From cab3502c154a23ed5def5f9b25f452504ccbf8cb Mon Sep 17 00:00:00 2001 From: Navin Date: Thu, 30 May 2013 20:34:44 +0200 Subject: [PATCH] Replace PostAnalyser module with a class Encapsulate Post#cooked_document as well Include specs for PostAnalyzer class --- .gitignore | 1 + app/models/post.rb | 42 +++----- app/models/post_analyzer.rb | 101 ++++++++++++++++++ spec/models/post_analyzer_spec.rb | 166 ++++++++++++++++++++++++++++++ 4 files changed, 280 insertions(+), 30 deletions(-) create mode 100644 app/models/post_analyzer.rb create mode 100644 spec/models/post_analyzer_spec.rb diff --git a/.gitignore b/.gitignore index 555bef937..4cf8d7c8c 100644 --- a/.gitignore +++ b/.gitignore @@ -35,6 +35,7 @@ config/discourse.pill # Ignore all logfiles and tempfiles. /log/*.log /tmp +/logfile # Ignore Eclipse .project file /.project diff --git a/app/models/post.rb b/app/models/post.rb index 7c4525e61..bd20e8b96 100644 --- a/app/models/post.rb +++ b/app/models/post.rb @@ -4,7 +4,7 @@ require_dependency 'rate_limiter' require_dependency 'post_revisor' require_dependency 'enum' require_dependency 'trashable' -require_dependency 'post_analyser' +require_dependency 'post_analyzer' require 'archetype' require 'digest/sha1' @@ -12,7 +12,6 @@ require 'digest/sha1' class Post < ActiveRecord::Base include RateLimiter::OnCreateRecord include Trashable - include PostAnalyser versioned if: :raw_changed? @@ -90,11 +89,6 @@ class Post < ActiveRecord::Base Digest::SHA1.hexdigest(raw.gsub(/\s+/, "").downcase) end - def cooked_document - self.cooked ||= cook(raw, topic_id: topic_id) - @cooked_document ||= Nokogiri::HTML.fragment(cooked) - end - def reset_cooked @cooked_document = nil self.cooked = nil @@ -104,16 +98,18 @@ class Post < ActiveRecord::Base @white_listed_image_classes ||= ['avatar', 'favicon', 'thumbnail'] end - # How many images are present in the post - def image_count - return 0 unless raw.present? 
+ def post_analyzer + @post_analyzer = PostAnalyzer.new(raw, topic_id) + end - cooked_document.search("img").reject do |t| - dom_class = t["class"] - if dom_class - (Post.white_listed_image_classes & dom_class.split(" ")).count > 0 - end - end.count + %w{raw_mentions linked_hosts image_count link_count raw_links}.each do |attr| + define_method(attr) do + PostAnalyzer.new(raw, topic_id).send(attr) + end + end + + def cook(*args) + PostAnalyzer.new(raw, topic_id).cook(*args) end @@ -236,20 +232,6 @@ class Post < ActiveRecord::Base Post.excerpt(cooked, maxlength, options) end - # What we use to cook posts - def cook(*args) - cooked = PrettyText.cook(*args) - - # If we have any of the oneboxes in the cache, throw them in right away, don't - # wait for the post processor. - dirty = false - result = Oneboxer.apply(cooked) do |url, elem| - Oneboxer.render_from_cache(url) - end - - cooked = result.to_html if result.changed? - cooked - end # A list of versions including the initial version def all_versions diff --git a/app/models/post_analyzer.rb b/app/models/post_analyzer.rb new file mode 100644 index 000000000..5431a16ce --- /dev/null +++ b/app/models/post_analyzer.rb @@ -0,0 +1,101 @@ +class PostAnalyzer + + attr_accessor :cooked, :raw + + def initialize(raw, topic_id) + @raw = raw + @topic_id = topic_id + end + + def cooked_document + @cooked = cook(@raw, topic_id: @topic_id) + @cooked_document = Nokogiri::HTML.fragment(@cooked) + end + + # What we use to cook posts + def cook(*args) + cooked = PrettyText.cook(*args) + + # If we have any of the oneboxes in the cache, throw them in right away, don't + # wait for the post processor. + dirty = false + result = Oneboxer.apply(cooked) do |url, elem| + Oneboxer.render_from_cache(url) + end + + cooked = result.to_html if result.changed? + cooked + end + + # How many images are present in the post + def image_count + return 0 unless @raw.present? 
+ + cooked_document.search("img").reject do |t| + dom_class = t["class"] + if dom_class + (Post.white_listed_image_classes & dom_class.split(" ")).count > 0 + end + end.count + end + + def raw_mentions + return [] if @raw.blank? + + # We don't count mentions in quotes + return @raw_mentions if @raw_mentions.present? + raw_stripped = @raw.gsub(/\[quote=(.*)\]([^\[]*?)\[\/quote\]/im, '') + + # Strip pre and code tags + doc = Nokogiri::HTML.fragment(raw_stripped) + doc.search("pre").remove + doc.search("code").remove + + results = doc.to_html.scan(PrettyText.mention_matcher) + @raw_mentions = results.uniq.map { |un| un.first.downcase.gsub!(/^@/, '') } + end + + # Count how many hosts are linked in the post + def linked_hosts + return {} if raw_links.blank? + + return @linked_hosts if @linked_hosts.present? + + @linked_hosts = {} + raw_links.each do |u| + uri = URI.parse(u) + host = uri.host + @linked_hosts[host] ||= 1 + end + @linked_hosts + end + + # Returns an array of all links in a post excluding mentions + def raw_links + return [] unless @raw.present? + + return @raw_links if @raw_links.present? + + # Don't include @mentions in the link count + @raw_links = [] + cooked_document.search("a[href]").each do |l| + next if link_is_a_mention?(l) + url = l.attributes['href'].to_s + @raw_links << url + end + @raw_links + end + + # How many links are present in the post + def link_count + raw_links.size + end + + private + + def link_is_a_mention?(l) + html_class = l.attributes['class'] + return false if html_class.nil? 
+ html_class.to_s == 'mention' && l.attributes['href'].to_s =~ /^\/users\// + end +end diff --git a/spec/models/post_analyzer_spec.rb b/spec/models/post_analyzer_spec.rb new file mode 100644 index 000000000..8b161c016 --- /dev/null +++ b/spec/models/post_analyzer_spec.rb @@ -0,0 +1,166 @@ +require 'spec_helper' + +describe PostAnalyzer do + + let(:topic) { Fabricate(:topic) } + let(:default_topic_id) { topic.id } + let(:post_args) do + {user: topic.user, topic: topic} + end + + context "links" do + let(:raw_no_links) { "hello world my name is evil trout" } + let(:raw_one_link_md) { "[jlawr](http://www.imdb.com/name/nm2225369)" } + let(:raw_two_links_html) { "disney reddit"} + let(:raw_three_links) { "http://discourse.org and http://discourse.org/another_url and http://www.imdb.com/name/nm2225369"} + + describe "raw_links" do + it "returns a blank collection for a post with no links" do + post_analyzer = PostAnalyzer.new(raw_no_links, default_topic_id) + post_analyzer.raw_links.should be_blank + end + + it "finds a link within markdown" do + post_analyzer = PostAnalyzer.new(raw_one_link_md, default_topic_id) + post_analyzer.raw_links.should == ["http://www.imdb.com/name/nm2225369"] + end + + it "can find two links from html" do + post_analyzer = PostAnalyzer.new(raw_two_links_html, default_topic_id) + post_analyzer.raw_links.should == ["http://disneyland.disney.go.com/", "http://reddit.com"] + end + + it "can find three links without markup" do + post_analyzer = PostAnalyzer.new(raw_three_links, default_topic_id) + post_analyzer.raw_links.should == ["http://discourse.org", "http://discourse.org/another_url", "http://www.imdb.com/name/nm2225369"] + end + end + + describe "linked_hosts" do + it "returns blank with no links" do + post_analyzer = PostAnalyzer.new(raw_no_links, default_topic_id) + post_analyzer.linked_hosts.should be_blank + end + + it "returns the host and a count for links" do + post_analyzer = PostAnalyzer.new(raw_two_links_html, default_topic_id) + 
post_analyzer.linked_hosts.should == {"disneyland.disney.go.com" => 1, "reddit.com" => 1} + end + + it "it counts properly with more than one link on the same host" do + post_analyzer = PostAnalyzer.new(raw_three_links, default_topic_id) + post_analyzer.linked_hosts.should == {"discourse.org" => 1, "www.imdb.com" => 1} + end + end + end + + describe "image_count" do + let(:raw_post_one_image_md) { "![sherlock](http://bbc.co.uk/sherlock.jpg)" } + let(:raw_post_two_images_html) { " " } + let(:raw_post_with_avatars) { 'smiley wink' } + let(:raw_post_with_favicon) { '' } + let(:raw_post_with_thumbnail) { '' } + let(:raw_post_with_two_classy_images) { " " } + + it "returns 0 images for an empty post" do + post_analyzer = PostAnalyzer.new("Hello world", nil) + post_analyzer.image_count.should == 0 + end + + it "finds images from markdown" do + post_analyzer = PostAnalyzer.new(raw_post_one_image_md, default_topic_id) + post_analyzer.image_count.should == 1 + end + + it "finds images from HTML" do + post_analyzer = PostAnalyzer.new(raw_post_two_images_html, default_topic_id) + post_analyzer.image_count.should == 2 + end + + it "doesn't count avatars as images" do + post_analyzer = PostAnalyzer.new(raw_post_with_avatars, default_topic_id) + post_analyzer.image_count.should == 0 + end + + it "doesn't count favicons as images" do + post_analyzer = PostAnalyzer.new(raw_post_with_favicon, default_topic_id) + post_analyzer.image_count.should == 0 + end + + it "doesn't count thumbnails as images" do + post_analyzer = PostAnalyzer.new(raw_post_with_thumbnail, default_topic_id) + post_analyzer.image_count.should == 0 + end + + it "doesn't count whitelisted images" do + Post.stubs(:white_listed_image_classes).returns(["classy"]) + post_analyzer = PostAnalyzer.new(raw_post_with_two_classy_images, default_topic_id) + post_analyzer.image_count.should == 0 + end + end + + describe "link_count" do + let(:raw_post_one_link_md) { "[sherlock](http://www.bbc.co.uk/programmes/b018ttws)" } + 
let(:raw_post_two_links_html) { "discourse twitter" } + let(:raw_post_with_mentions) { "hello @novemberkilo how are you doing?" } + + it "returns 0 links for an empty post" do + post_analyzer = PostAnalyzer.new("Hello world", nil) + post_analyzer.link_count.should == 0 + end + + it "returns 0 links for a post with mentions" do + post_analyzer = PostAnalyzer.new(raw_post_with_mentions, default_topic_id) + post_analyzer.link_count.should == 0 + end + + it "finds links from markdown" do + post_analyzer = PostAnalyzer.new(raw_post_one_link_md, default_topic_id) + post_analyzer.link_count.should == 1 + end + + it "finds links from HTML" do + post_analyzer = PostAnalyzer.new(raw_post_two_links_html, default_topic_id) + post_analyzer.link_count.should == 2 + end + end + + + describe "raw_mentions" do + + it "returns an empty array with no matches" do + post_analyzer = PostAnalyzer.new("Hello Jake and Finn!", default_topic_id) + post_analyzer.raw_mentions.should == [] + end + + it "returns lowercase unique versions of the mentions" do + post_analyzer = PostAnalyzer.new("@Jake @Finn @Jake", default_topic_id) + post_analyzer.raw_mentions.should == ['jake', 'finn'] + end + + it "ignores pre" do + post_analyzer = PostAnalyzer.new("
@Jake
@Finn", default_topic_id) + post_analyzer.raw_mentions.should == ['finn'] + end + + it "catches content between pre tags" do + post_analyzer = PostAnalyzer.new("
hello
@Finn
", default_topic_id)
+      post_analyzer.raw_mentions.should == ['finn']
+    end
+
+    it "ignores code" do # @Finn is wrapped in <code>, which raw_mentions removes before scanning for mentions
+      post_analyzer = PostAnalyzer.new("@Jake <code>@Finn</code>", default_topic_id)
+      post_analyzer.raw_mentions.should == ['jake']
+    end
+
+    it "ignores quotes" do # a mention inside a [quote]...[/quote] section must not be reported
+      analyzer = PostAnalyzer.new("[quote=\"Evil Trout\"]@Jake[/quote] @Finn", default_topic_id)
+      analyzer.raw_mentions.should == ['finn']
+    end
+
+    it "handles underscore in username" do # results are lowercased and de-duplicated, in order of first appearance
+      analyzer = PostAnalyzer.new("@Jake @Finn @Jake_Old", default_topic_id)
+      analyzer.raw_mentions.should == ['jake', 'finn', 'jake_old']
+    end
+  end
+end