diff --git a/lib/oneboxer.rb b/lib/oneboxer.rb index 01b7da6ad..c048023cd 100644 --- a/lib/oneboxer.rb +++ b/lib/oneboxer.rb @@ -30,18 +30,22 @@ module Oneboxer def self.onebox_nocache(url) oneboxer = onebox_for_url(url) return oneboxer.onebox if oneboxer.present? + + whitelist_entry = Whitelist.entry_for_url(url) - if Whitelist.allowed?(url) + if whitelist_entry.present? page_html = open(url).read if page_html.present? doc = Nokogiri::HTML(page_html) - # See if if it has an oembed thing we can use - (doc/"link[@type='application/json+oembed']").each do |oembed| - return OembedOnebox.new(oembed[:href]).onebox - end - (doc/"link[@type='text/json+oembed']").each do |oembed| - return OembedOnebox.new(oembed[:href]).onebox + if whitelist_entry.allows_oembed? + # See if if it has an oembed thing we can use + (doc/"link[@type='application/json+oembed']").each do |oembed| + return OembedOnebox.new(oembed[:href]).onebox + end + (doc/"link[@type='text/json+oembed']").each do |oembed| + return OembedOnebox.new(oembed[:href]).onebox + end end # Check for opengraph diff --git a/lib/oneboxer/clikthrough_onebox.rb b/lib/oneboxer/clikthrough_onebox.rb index 8561c6555..7869ea895 100644 --- a/lib/oneboxer/clikthrough_onebox.rb +++ b/lib/oneboxer/clikthrough_onebox.rb @@ -3,7 +3,7 @@ require_dependency 'oneboxer/oembed_onebox' module Oneboxer class ClikthroughOnebox < OembedOnebox - matcher /clikthrough\.com\/theater\/video\/\d+$/ + matcher /^https?:\/\/(?:www\.)?clikthrough\.com\/theater\/video\/\d+$/ def oembed_endpoint "http://clikthrough.com/services/oembed?url=#{BaseOnebox.uriencode(@url)}" diff --git a/lib/oneboxer/dailymotion_onebox.rb b/lib/oneboxer/dailymotion_onebox.rb index 83ae8647c..fccabbb02 100644 --- a/lib/oneboxer/dailymotion_onebox.rb +++ b/lib/oneboxer/dailymotion_onebox.rb @@ -3,7 +3,7 @@ require_dependency 'oneboxer/oembed_onebox' module Oneboxer class DailymotionOnebox < OembedOnebox - matcher /dailymotion\.com\/.+$/ + matcher /^https?:\/\/(?:www\.)?dailymotion\.com\/.+$/ def oembed_endpoint "http://www.dailymotion.com/api/oembed/?url=#{BaseOnebox.uriencode(@url)}" diff --git a/lib/oneboxer/dotsub_onebox.rb b/lib/oneboxer/dotsub_onebox.rb index 2d524e937..96bbabe5b 100644 --- a/lib/oneboxer/dotsub_onebox.rb +++ b/lib/oneboxer/dotsub_onebox.rb @@ -3,7 +3,7 @@ require_dependency 'oneboxer/oembed_onebox' module Oneboxer class DotsubOnebox < OembedOnebox - matcher /dotsub\.com\/.+$/ + matcher /^https?:\/\/(?:www\.)?dotsub\.com\/.+$/ def oembed_endpoint "http://dotsub.com/services/oembed?url=#{BaseOnebox.uriencode(@url)}" diff --git a/lib/oneboxer/github_blob_onebox.rb b/lib/oneboxer/github_blob_onebox.rb index 59fb282dc..859ccf15c 100644 --- a/lib/oneboxer/github_blob_onebox.rb +++ b/lib/oneboxer/github_blob_onebox.rb @@ -3,7 +3,7 @@ require_dependency 'oneboxer/handlebars_onebox' module Oneboxer class GithubBlobOnebox < HandlebarsOnebox - matcher /github\.com\/[^\/]+\/[^\/]+\/blob\/.*/ + matcher /^https?:\/\/(?:www\.)?github\.com\/[^\/]+\/[^\/]+\/blob\/.*/ favicon 'github.png' def translate_url diff --git a/lib/oneboxer/kinomap_onebox.rb b/lib/oneboxer/kinomap_onebox.rb index 1e1af58de..cb427ee4f 100644 --- a/lib/oneboxer/kinomap_onebox.rb +++ b/lib/oneboxer/kinomap_onebox.rb @@ -3,7 +3,7 @@ require_dependency 'oneboxer/oembed_onebox' module Oneboxer class KinomapOnebox < OembedOnebox - matcher /kinomap\.com/ + matcher /^https?:\/\/(?:www\.)?kinomap\.com/ def oembed_endpoint "http://www.kinomap.com/oembed?url=#{BaseOnebox.uriencode(@url)}&format=json" diff --git a/lib/oneboxer/nfb_onebox.rb b/lib/oneboxer/nfb_onebox.rb index f8677f0d4..f749c097f 100644 --- a/lib/oneboxer/nfb_onebox.rb +++ b/lib/oneboxer/nfb_onebox.rb @@ -3,7 +3,7 @@ require_dependency 'oneboxer/oembed_onebox' module Oneboxer class NfbOnebox < OembedOnebox - matcher /nfb\.ca\/film\/[-\w]+\/?/ + matcher /^https?:\/\/(?:www\.)?nfb\.ca\/film\/[-\w]+\/?/ def oembed_endpoint "http://www.nfb.ca/remote/services/oembed/?url=#{BaseOnebox.uriencode(@url)}&format=json" diff --git a/lib/oneboxer/viddler_onebox.rb b/lib/oneboxer/viddler_onebox.rb index 987e05cd1..31a150b5f 100644 --- a/lib/oneboxer/viddler_onebox.rb +++ b/lib/oneboxer/viddler_onebox.rb @@ -3,7 +3,7 @@ require_dependency 'oneboxer/oembed_onebox' module Oneboxer class ViddlerOnebox < OembedOnebox - matcher /viddler\.com\/.+$/ + matcher /^https?:\/\/(?:www\.)?viddler\.com\/.+$/ def oembed_endpoint "http://lab.viddler.com/services/oembed/?url=#{BaseOnebox.uriencode(@url)}" diff --git a/lib/oneboxer/whitelist.rb b/lib/oneboxer/whitelist.rb index 199a42c74..6fe9833fc 100644 --- a/lib/oneboxer/whitelist.rb +++ b/lib/oneboxer/whitelist.rb @@ -2,85 +2,103 @@ module Oneboxer module Whitelist def self.entries - [/cnn\.com\/.+/, - /washingtonpost\.com\/.+/, - /\/\d{4}\/\d{2}\/\d{2}\//, # wordpress - /funnyordie\.com\/.+/, - /youtube\.com\/.+/, - /youtu\.be\/.+/, - /500px\.com\/.+/, - /scribd\.com\/.+/, - /photobucket\.com\/.+/, - /ebay\.(com|ca|co\.uk)\/.+/, - /nytimes\.com\/.+/, - /tumblr\.com\/.+/, - /pinterest\.com\/.+/, - /imdb\.com\/.+/, - /bbc\.co\.uk\/.+/, - /ask\.com\/.+/, - /huffingtonpost\.com\/.+/, - /aol\.(com|ca)\/.+/, - /espn\.go\.com\/.+/, - /about\.com\/.+/, - /cnet\.com\/.+/, - /ehow\.com\/.+/, - /dailymail\.co\.uk\/.+/, - /indiatimes\.com\/.+/, - /answers\.com\/.+/, - /instagr\.am\/.+/, - /battle\.net\/.+/, - /sourceforge\.net\/.+/, - /myspace\.com\/.+/, - /wikia\.com\/.+/, - /etsy\.com\/.+/, - /walmart\.com\/.+/, - /reference\.com\/.+/, - /yelp\.com\/.+/, - /foxnews\.com\/.+/, - /guardian\.co\.uk\/.+/, - /digg\.com\/.+/, - /squidoo\.com\/.+/, - /wsj\.com\/.+/, - /archive\.org\/.+/, - /nba\.com\/.+/, - /samsung\.com\/.+/, - /mashable\.com\/.+/, - /forbes\.com\/.+/, - /soundcloud\.com\/.+/, - /thefreedictionary\.com\/.+/, - /groupon\.com\/.+/, - /ikea\.com\/.+/, - /dell\.com\/.+/, - /mlb\.com\/.+/, - /bestbuy\.(com|ca)\/.+/, - /bloomberg\.com\/.+/, - /ign\.com\/.+/, - /twitpic\.com\/.+/, - /techcrunch\.com\/.+/, - /usatoday\.com\/.+/, - /go\.com\/.+/, - /businessinsider\.com\/.+/, - /zillow\.com\/.+/, - /tmz\.com\/.+/, - /thesun\.co\.uk\/.+/, - /thestar\.(com|ca)\/.+/, - /theglobeandmail\.com\/.+/, - /torontosun\.com\/.+/, - /kickstarter\.com\/.+/, - /wired\.com\/.+/, - /time\.com\/.+/, - /npr\.org\/.+/, - /cracked\.com\/.+/, - /thinkgeek\.com\/.+/, - /deadline\.com\/.+/ - ] - end - - def self.allowed?(url) - #return true - entries.each {|e| return true if url =~ e } - false + [ + Entry.new(/^https?:\/\/(?:www\.)?cnn\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?washingtonpost\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?funnyordie\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?youtube\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?youtu\.be\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?500px\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?scribd\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?photobucket\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?ebay\.(com|ca|co\.uk)\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?nytimes\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?pinterest\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?imdb\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?bbc\.co\.uk\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?ask\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?huffingtonpost\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?aol\.(com|ca)\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?espn\.go\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?about\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?cnet\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?ehow\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?dailymail\.co\.uk\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?indiatimes\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?answers\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?instagr\.am\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?battle\.net\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?sourceforge\.net\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?myspace\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?wikia\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?etsy\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?walmart\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?reference\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?yelp\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?foxnews\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?guardian\.co\.uk\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?digg\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?squidoo\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?wsj\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?archive\.org\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?nba\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?samsung\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?mashable\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?forbes\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?soundcloud\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?thefreedictionary\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?groupon\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?ikea\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?dell\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?mlb\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?bestbuy\.(com|ca)\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?bloomberg\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?ign\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?twitpic\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?techcrunch\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?usatoday\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?go\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?businessinsider\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?zillow\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?tmz\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?thesun\.co\.uk\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?thestar\.(com|ca)\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?theglobeandmail\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?torontosun\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?kickstarter\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?wired\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?time\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?npr\.org\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?cracked\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?deadline\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?thinkgeek\.com\/.+/), + Entry.new(/^https?:\/\/(?:www\.)?tumblr\.com\/.+/, false), + Entry.new(/\/\d{4}\/\d{2}\/\d{2}\//, false), # wordpress + ] end + + def self.entry_for_url(url) + entries.each {|e| return e if e.matches?(url) } + nil + end + + private + + class Entry + # oembed = false is probably safer, but this is the least-drastic change + def initialize(pattern, oembed = true) + @pattern = pattern + @oembed = oembed + end + + def allows_oembed? + @oembed + end + + def matches?(url) + url =~ @pattern + end + end end diff --git a/lib/oneboxer/yfrog_onebox.rb b/lib/oneboxer/yfrog_onebox.rb index 6090930d9..96c72918c 100644 --- a/lib/oneboxer/yfrog_onebox.rb +++ b/lib/oneboxer/yfrog_onebox.rb @@ -3,7 +3,7 @@ require_dependency 'oneboxer/oembed_onebox' module Oneboxer class YfrogOnebox < OembedOnebox - matcher /yfrog\.(com|ru|com\.tr|it|fr|co\.il|co\.uk|com\.pl|pl|eu|us)\/[a-zA-Z0-9]+/ + matcher /^https?:\/\/(?:www\.)?yfrog\.(com|ru|com\.tr|it|fr|co\.il|co\.uk|com\.pl|pl|eu|us)\/[a-zA-Z0-9]+/ def oembed_endpoint "http://www.yfrog.com/api/oembed/?url=#{BaseOnebox.uriencode(@url)}&format=json"