require 'open-uri'
require 'digest/sha1'
require_dependency 'oneboxer/base'
require_dependency 'oneboxer/whitelist'

Dir["#{Rails.root}/lib/oneboxer/*_onebox.rb"].each { |f|
  require_dependency(f.split('/')[-2..-1].join('/'))
}

module Oneboxer
  extend Oneboxer::Base

  # keep reloaders happy
  unless defined? Oneboxer::Result
    Result = Struct.new(:doc, :changed) do
      def to_html
        doc.to_html
      end

      def changed?
        changed
      end
    end
  end

  # Register every onebox class defined in lib/oneboxer; for example,
  # "youtube_onebox.rb" maps to Oneboxer::YoutubeOnebox
  Dir["#{Rails.root}/lib/oneboxer/*_onebox.rb"].sort.each do |f|
    add_onebox "Oneboxer::#{Pathname.new(f).basename.to_s.gsub(/\.rb$/, '').classify}".constantize
  end

  def self.default_expiry
    1.day
  end

  # Return a onebox class instance for a given URL, or nil if no matcher applies
  def self.onebox_for_url(url)
    matchers.each do |matcher|
      regexp = matcher.regexp
      klass = matcher.klass

      # a matcher may supply its regexp lazily as a Proc; resolve it first
      regexp = regexp.call if regexp.is_a?(Proc)
      return klass.new(url) if url =~ regexp
    end
    nil
  end
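
  # Illustrative only (assumes a registered YoutubeOnebox whose regexp
  # matches youtube watch URLs):
  #
  #   Oneboxer.onebox_for_url("http://www.youtube.com/watch?v=abc123")
  #   # => #<Oneboxer::YoutubeOnebox ...>
  #   Oneboxer.onebox_for_url("http://example.com/unmatched")
  #   # => nil
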
  # Retrieve the onebox for a url without caching
  def self.onebox_nocache(url)
    oneboxer = onebox_for_url(url)
    return oneboxer.onebox if oneboxer.present?

    whitelist_entry = Whitelist.entry_for_url(url)

    if whitelist_entry.present?
      # TODO - only download HEAD section
      # TODO - sane timeout
      # TODO - FAIL if for any reason you are downloading more than 5000 bytes
      page_html = open(url).read
      if page_html.present?
        doc = Nokogiri::HTML(page_html)

        if whitelist_entry.allows_oembed?
          # See if it has an oembed endpoint we can use
          (doc / "link[@type='application/json+oembed']").each do |oembed|
            return OembedOnebox.new(oembed[:href]).onebox
          end
          (doc / "link[@type='text/json+oembed']").each do |oembed|
            return OembedOnebox.new(oembed[:href]).onebox
          end
        end

        # Check for opengraph
        open_graph = Oneboxer.parse_open_graph(doc)
        return OpenGraphOnebox.new(url, open_graph).onebox if open_graph.present?
      end
    end

    nil
  rescue OpenURI::HTTPError
    nil
  end
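
  # Resolution order, sketched: a registered matcher wins outright; for
  # whitelisted hosts the page is fetched and an oembed <link> such as
  #
  #   <link type="application/json+oembed" href="https://example.com/oembed?...">
  #
  # is tried before falling back to OpenGraph tags.
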
  # Find onebox links ("a.onebox") in HTML, yielding each href and element,
  # and returning the parsed document when finished.
  def self.each_onebox_link(string_or_doc)
    doc = string_or_doc
    doc = Nokogiri::HTML::fragment(doc) if doc.is_a?(String)

    onebox_links = doc.search("a.onebox")
    if onebox_links.present?
      onebox_links.each do |link|
        yield link['href'], link if link['href'].present?
      end
    end

    doc
  end
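
  # Usage sketch (any cooked HTML where onebox links carry the "onebox" class):
  #
  #   html = '<p><a href="http://example.com" class="onebox">link</a></p>'
  #   Oneboxer.each_onebox_link(html) do |url, element|
  #     url # => "http://example.com"
  #   end
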
  def self.apply(string_or_doc)
    doc = string_or_doc
    doc = Nokogiri::HTML::fragment(doc) if doc.is_a?(String)
    changed = false

    Oneboxer.each_onebox_link(doc) do |url, element|
      onebox, preview = yield(url, element)
      if onebox
        parsed_onebox = Nokogiri::HTML::fragment(onebox)
        next unless parsed_onebox.children.count > 0

        # special logic to strip empty p elements: if the link is a
        # paragraph's only child and the onebox is a block-level div,
        # swap out the paragraph itself so we don't leave an empty <p> behind
        if element.parent &&
           element.parent.node_name.downcase == "p" &&
           element.parent.children.count == 1 &&
           parsed_onebox.children.first.name.downcase == "div"
          element = element.parent
        end

        changed = true
        element.swap parsed_onebox.to_html
      end
    end

    Result.new(doc, changed)
  end
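
  # A minimal driving sketch: the block maps each url/element to replacement
  # markup (returning nil leaves the link untouched):
  #
  #   result = Oneboxer.apply(cooked_html) do |url, element|
  #     Oneboxer.onebox(url)
  #   end
  #   result.changed? # => true if anything was swapped
  #   result.to_html
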
  def self.cache_key_for(url)
    "onebox:#{Digest::SHA1.hexdigest(url)}"
  end

  def self.preview_cache_key_for(url)
    "onebox:preview:#{Digest::SHA1.hexdigest(url)}"
  end

  def self.render_from_cache(url)
    Rails.cache.read(cache_key_for(url))
  end

  # Cache results from a onebox call
  def self.fetch_and_cache(url, args)
    contents, preview = onebox_nocache(url)
    return nil if contents.blank?

    Rails.cache.write(cache_key_for(url), contents, expires_in: default_expiry)
    if preview.present?
      Rails.cache.write(preview_cache_key_for(url), preview, expires_in: default_expiry)
    end

    [contents, preview]
  end

  def self.invalidate(url)
    Rails.cache.delete(cache_key_for(url))
  end

  def self.preview(url, args = {})
    # Look for a cached preview first
    cached = Rails.cache.read(preview_cache_key_for(url)) unless args[:no_cache].present?
    return cached if cached.present?

    # Try the cached full version
    cached = render_from_cache(url)
    return cached if cached.present?

    # If that fails, fetch and cache it
    contents, cached = fetch_and_cache(url, args)
    return cached if cached.present?
    contents
  end
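
  # Fallback order, sketched:
  #
  #   Oneboxer.preview(url)                  # preview cache, then full cache, then live fetch
  #   Oneboxer.preview(url, no_cache: true)  # skip the preview cache
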
  # Return the cooked content for a url, caching the result for performance
  def self.onebox(url, args = {})
    if args[:invalidate_oneboxes]
      # Remove the onebox from the cache
      Oneboxer.invalidate(url)
    else
      contents = render_from_cache(url)
      return contents if contents.present?
    end

    fetch_and_cache(url, args)
  end
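
  # Typical call sites, sketched:
  #
  #   Oneboxer.onebox("http://example.com/article")                            # cached render
  #   Oneboxer.onebox("http://example.com/article", invalidate_oneboxes: true) # force refresh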
end