Replace Hpricot with Nokogiri

This commit is contained in:
Jaime Iniesta 2013-02-12 09:46:45 -05:00
parent 84a167725d
commit 6995e75d41
15 changed files with 36 additions and 34 deletions

View file

@ -20,7 +20,6 @@ gem 'fastimage'
gem 'fog', require: false gem 'fog', require: false
gem 'has_ip_address' gem 'has_ip_address'
gem 'hiredis' gem 'hiredis'
gem 'hpricot'
gem 'i18n-js' gem 'i18n-js'
gem 'jquery-rails' gem 'jquery-rails'
gem 'multi_json' gem 'multi_json'

View file

@ -192,7 +192,6 @@ GEM
highline (1.6.15) highline (1.6.15)
hike (1.2.1) hike (1.2.1)
hiredis (0.4.5) hiredis (0.4.5)
hpricot (0.8.6)
httpauth (0.2.0) httpauth (0.2.0)
i18n (0.6.1) i18n (0.6.1)
i18n-js (2.1.2) i18n-js (2.1.2)
@ -464,7 +463,6 @@ DEPENDENCIES
guard-spork guard-spork
has_ip_address has_ip_address
hiredis hiredis
hpricot
i18n-js i18n-js
image_optim image_optim
jasminerice jasminerice

View file

@ -4,7 +4,6 @@ require_dependency 'rate_limiter'
require_dependency 'post_revisor' require_dependency 'post_revisor'
require 'archetype' require 'archetype'
require 'hpricot'
require 'digest/sha1' require 'digest/sha1'
class Post < ActiveRecord::Base class Post < ActiveRecord::Base

View file

@ -31,7 +31,7 @@ The following Ruby Gems are used in Discourse:
* [vestal_versions](https://rubygems.org/gems/vestal_versions) * [vestal_versions](https://rubygems.org/gems/vestal_versions)
* [coffee-rails](https://rubygems.org/gems/coffee-rails) * [coffee-rails](https://rubygems.org/gems/coffee-rails)
* [uglifier](https://rubygems.org/gems/uglifier) * [uglifier](https://rubygems.org/gems/uglifier)
* [hpricot](https://rubygems.org/gems/hpricot) * [nokogiri](https://rubygems.org/gems/nokogiri)
* [uuidtools](https://rubygems.org/gems/uuidtools) * [uuidtools](https://rubygems.org/gems/uuidtools)
* [rinku](https://rubygems.org/gems/rinku) * [rinku](https://rubygems.org/gems/rinku)
* [ruby-openid](https://rubygems.org/gems/ruby-openid) * [ruby-openid](https://rubygems.org/gems/ruby-openid)

View file

@ -9,7 +9,7 @@ class CookedPostProcessor
@dirty = false @dirty = false
@opts = opts @opts = opts
@post = post @post = post
@doc = Hpricot(post.cooked) @doc = Nokogiri::HTML(post.cooked)
end end
def dirty? def dirty?

View file

@ -34,7 +34,7 @@ module Oneboxer
if Whitelist.allowed?(url) if Whitelist.allowed?(url)
page_html = open(url).read page_html = open(url).read
if page_html.present? if page_html.present?
doc = Hpricot(page_html) doc = Nokogiri::HTML(page_html)
# See if it has an oembed thing we can use # See if it has an oembed thing we can use
(doc/"link[@type='application/json+oembed']").each do |oembed| (doc/"link[@type='application/json+oembed']").each do |oembed|
@ -56,7 +56,7 @@ module Oneboxer
# Parse URLs out of HTML, returning the document when finished. # Parse URLs out of HTML, returning the document when finished.
def self.each_onebox_link(string_or_doc) def self.each_onebox_link(string_or_doc)
doc = string_or_doc doc = string_or_doc
doc = Hpricot(doc) if doc.is_a?(String) doc = Nokogiri::HTML(doc) if doc.is_a?(String)
onebox_links = doc.search("a.onebox") onebox_links = doc.search("a.onebox")
if onebox_links.present? if onebox_links.present?

View file

@ -22,19 +22,19 @@ module Oneboxer
end end
def parse(data) def parse(data)
hp = Hpricot(data) html_doc = Nokogiri::HTML(data)
result = {} result = {}
result[:title] = hp.at("h1") result[:title] = html_doc.at("h1")
result[:title] = result[:title].inner_html if result[:title].present? result[:title] = result[:title].inner_html if result[:title].present?
image = hp.at(".main-image img") image = html_doc.at(".main-image img")
result[:image] = image['src'] if image result[:image] = image['src'] if image
result[:by_info] = hp.at("#by-line") result[:by_info] = html_doc.at("#by-line")
result[:by_info] = BaseOnebox.remove_whitespace(result[:by_info].inner_html) if result[:by_info].present? result[:by_info] = BaseOnebox.remove_whitespace(result[:by_info].inner_html) if result[:by_info].present?
summary = hp.at("#description-and-details-content") summary = html_doc.at("#description-and-details-content")
result[:text] = summary.inner_html if summary.present? result[:text] = summary.inner_html if summary.present?
result result

View file

@ -12,20 +12,20 @@ module Oneboxer
def parse(data) def parse(data)
hp = Hpricot(data) html_doc = Nokogiri::HTML(data)
result = {} result = {}
m = hp.at("h1.doc-banner-title") m = html_doc.at("h1.doc-banner-title")
result[:title] = m.inner_text if m result[:title] = m.inner_text if m
m = hp.at("div#doc-original-text") m = html_doc.at("div#doc-original-text")
if m if m
result[:text] = BaseOnebox.replace_tags_with_spaces(m.inner_html) result[:text] = BaseOnebox.replace_tags_with_spaces(m.inner_html)
result[:text] = result[:text][0..MAX_TEXT] result[:text] = result[:text][0..MAX_TEXT]
end end
m = hp.at("div.doc-banner-icon img") m = html_doc.at("div.doc-banner-icon img")
result[:image] = m['src'] if m result[:image] = m['src'] if m
result result

View file

@ -17,17 +17,17 @@ module Oneboxer
def parse(data) def parse(data)
hp = Hpricot(data) html_doc = Nokogiri::HTML(data)
result = {} result = {}
m = hp.at("h1") m = html_doc.at("h1")
result[:title] = m.inner_text if m result[:title] = m.inner_text if m
m = hp.at("h4 ~ p") m = html_doc.at("h4 ~ p")
result[:text] = m.inner_text[0..MAX_TEXT] if m result[:text] = m.inner_text[0..MAX_TEXT] if m
m = hp.at(".product img.artwork") m = html_doc.at(".product img.artwork")
result[:image] = m['src'] if m result[:image] = m['src'] if m
result result

View file

@ -9,7 +9,7 @@ module Oneboxer
page_html = open(@url).read page_html = open(@url).read
return nil if page_html.blank? return nil if page_html.blank?
doc = Hpricot(page_html) doc = Nokogiri::HTML(page_html)
# Flickr's oembed just stopped returning images for no reason. Let's use opengraph instead. # Flickr's oembed just stopped returning images for no reason. Let's use opengraph instead.
open_graph = Oneboxer.parse_open_graph(doc) open_graph = Oneboxer.parse_open_graph(doc)

View file

@ -20,23 +20,23 @@ module Oneboxer
def parse(data) def parse(data)
hp = Hpricot(data) html_doc = Nokogiri::HTML(data)
result = {} result = {}
title = hp.at('title').inner_html title = html_doc.at('title').inner_html
result[:title] = title.gsub!(/ - Wikipedia, the free encyclopedia/, '') if title.present? result[:title] = title.gsub!(/ - Wikipedia, the free encyclopedia/, '') if title.present?
# get the first image > 150 pix high # get the first image > 150 pix high
images = hp.search("img").select { |img| img['height'].to_i > 150 } images = html_doc.search("img").select { |img| img['height'].to_i > 150 }
result[:image] = "http:#{images[0]["src"]}" unless images.empty? result[:image] = "http:#{images[0]["src"]}" unless images.empty?
# remove the table from mobile layout, as it can contain paras in some rare cases # remove the table from mobile layout, as it can contain paras in some rare cases
hp.search("table").remove html_doc.search("table").remove
# get all the paras # get all the paras
paras = hp.search("p") paras = html_doc.search("p")
text = "" text = ""
unless paras.empty? unless paras.empty?

View file

@ -19,7 +19,10 @@ describe CookedPostProcessor do
end end
it 'inserts the onebox' do it 'inserts the onebox' do
@cpp.html.should == "GANGNAM STYLE" @cpp.html.should == <<EXPECTED
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><body>GANGNAM STYLE</body></html>
EXPECTED
end end
end end

View file

@ -26,7 +26,7 @@ private
<h3><a href="http://www.amazon.com/Ruby-Programming-Language-David-Flanagan/dp/0596516177" target="_blank">The Ruby Programming Language (Paperback)</a></h3> <h3><a href="http://www.amazon.com/Ruby-Programming-Language-David-Flanagan/dp/0596516177" target="_blank">The Ruby Programming Language (Paperback)</a></h3>
<h4>David Flanagan, Yukihiro Matsumoto</h4> <h4>David Flanagan, Yukihiro Matsumoto</h4>
The Ruby Programming Language is the authoritative guide to Ruby&nbsp;... The Ruby Programming Language is the authoritative guide to Ruby ...
</div> </div>
<div class='clearfix'></div> <div class='clearfix'></div>

View file

@ -25,7 +25,10 @@ private
<img src="https://lh5.ggpht.com/wrYYVu74XNUu2WHk0aSZEqgdCDCNti9Fl0_dJnhgR6jY04ajQgVg5ABMatfcTDsB810=w124" class="thumbnail"> <img src="https://lh5.ggpht.com/wrYYVu74XNUu2WHk0aSZEqgdCDCNti9Fl0_dJnhgR6jY04ajQgVg5ABMatfcTDsB810=w124" class="thumbnail">
<h3><a href="https://play.google.com/store/apps/details?id=com.moosoft.parrot" target="_blank">Talking Parrot</a></h3> <h3><a href="https://play.google.com/store/apps/details?id=com.moosoft.parrot" target="_blank">Talking Parrot</a></h3>
Listen to the parrot repeat what you say. A Fun application for all ages. Upgrade to Talking Parrot Pro to save sounds, set them as your ringtone and control recording. Press the MENU button to access the settings where you can change the record time and repeat count. This app uses anonymous usage stats to understand and improve performance. Comments and feedback welcome. Listen to the parrot repeat what you say. A Fun application for all ages. Upgrade to Talking Parrot Pro to save sounds, set them as your ringtone and control recording.
Press the MENU button to access the settings where you can change the record time and repeat count.
This app uses anonymous usage stats to understand and improve performance.
Comments and feedback welcome.
</div> </div>
<div class='clearfix'></div> <div class='clearfix'></div>
</div> </div>

View file

@ -145,16 +145,16 @@ describe Oneboxer do
it 'yields each url and element when given a string' do it 'yields each url and element when given a string' do
result = Oneboxer.each_onebox_link(@html) do |url, element| result = Oneboxer.each_onebox_link(@html) do |url, element|
element.is_a?(Hpricot::Elem).should be_true element.is_a?(Nokogiri::XML::Element).should be_true
url.should == 'http://discourse.org' url.should == 'http://discourse.org'
end end
result.kind_of?(Hpricot::Doc).should be_true result.kind_of?(Nokogiri::HTML::Document).should be_true
end end
it 'yields each url and element when given a doc' do it 'yields each url and element when given a doc' do
doc = Hpricot(@html) doc = Nokogiri::HTML(@html)
Oneboxer.each_onebox_link(doc) do |url, element| Oneboxer.each_onebox_link(doc) do |url, element|
element.is_a?(Hpricot::Elem).should be_true element.is_a?(Nokogiri::XML::Element).should be_true
url.should == 'http://discourse.org' url.should == 'http://discourse.org'
end end
end end