2013-12-31 14:37:43 -05:00
require 'erb'
require_dependency 'nokogiri'
class TopicEmbed < ActiveRecord :: Base
belongs_to :topic
belongs_to :post
validates_presence_of :embed_url
2015-06-15 12:08:55 -04:00
validates_uniqueness_of :embed_url
2013-12-31 14:37:43 -05:00
2014-03-19 16:33:21 -04:00
# Canonicalizes an embed URL so that equivalent spellings compare equal.
#
# Steps, in order: trim surrounding whitespace, downcase, drop one trailing
# slash, collapse the first run of consecutive hyphens into a single hyphen.
#
# url - the URL String to normalize.
#
# Returns a new String; the argument itself is not modified.
def self.normalize_url(url)
  # Strip first: if whitespace is trimmed last, a trailing space or tab hides
  # the trailing slash from the end-of-string match ("http://a.com/ " would
  # keep its slash).
  # Only the first hyphen run is collapsed (sub, not gsub); that is left as-is
  # so results keep matching URLs that were normalized and stored earlier.
  url.strip.downcase.sub(/\/\z/, '').sub(/-+/, '-')
end
2014-04-02 15:54:21 -04:00
# Builds the HTML footer that credits the page an embed was imported from.
#
# url - the source URL; used both as the link target and as the link text.
#
# Returns a String: a horizontal rule followed by a <small> element holding
# the 'embed.imported_from' translation with the link interpolated.
def self.imported_from_html(url)
  # The URL comes from outside and lands inside a single-quoted attribute and
  # in element text, so escape it; otherwise a quote or angle bracket in the
  # URL could break out of the markup.
  safe_url = ERB::Util.html_escape(url)
  link = "<a href='#{safe_url}'>#{safe_url}</a>"
  "\n<hr>\n<small>#{I18n.t('embed.imported_from', link: link)}</small>\n"
end
2013-12-31 14:37:43 -05:00
# Import an article from a source (RSS/Atom/Other).
#
# Creates a topic, first post and TopicEmbed record for a URL that has not
# been embedded yet; for a known URL, revises the existing post when the
# content hash has changed.
#
# user     - the user the post is created or revised as.
# url      - the article URL; must start with http:// or https://.
# title    - the title for a newly created topic.
# contents - the article HTML; may be nil.
#
# Returns the post (created or existing), or nil when the URL is rejected.
# The value may also be nil/falsy when creation yields no post or the embed
# has no post.
def self.import(user, url, title, contents)
  # \A rather than ^: ^ matches after any newline, which would let a value
  # such as "javascript:...\nhttp://" through the scheme check.
  return unless url =~ %r{\Ahttps?://}

  contents = first_paragraph_from(contents) if SiteSetting.embed_truncate

  # Build a fresh String instead of appending in place: this leaves the
  # caller's string untouched and tolerates nil or a non-String value coming
  # back from first_paragraph_from (its fallback returns a parsed node).
  contents = contents.to_s + imported_from_html(url)

  url = normalize_url(url)
  embed = TopicEmbed.find_by("lower(embed_url) = ?", url)
  content_sha1 = Digest::SHA1.hexdigest(contents)
  post = nil

  # If there is no embed, create a topic, post and the embed.
  if embed.blank?
    Topic.transaction do
      creator = PostCreator.new(user,
                                title: title,
                                raw: absolutize_urls(url, contents),
                                skip_validations: true,
                                cook_method: Post.cook_methods[:raw_html],
                                category: SiteSetting.embed_category)
      post = creator.create
      if post.present?
        TopicEmbed.create!(topic_id: post.topic_id,
                           embed_url: url,
                           content_sha1: content_sha1,
                           post_id: post.id)
      end
    end
  else
    post = embed.post
    # Update the topic if it changed. URLs are only absolutized here, when a
    # revision is actually written.
    if post && post.topic && content_sha1 != embed.content_sha1
      post.revise(user, { raw: absolutize_urls(url, contents) }, skip_validations: true, bypass_rate_limiter: true)
      embed.update_column(:content_sha1, content_sha1)
    end
  end

  post
end
2014-04-01 18:16:56 -04:00
# Fetches a remote page and extracts its readable content.
#
# url - the page URL. It is normalized before fetching.
#       NOTE(review): normalize_url downcases the whole URL, including the
#       path, so case-sensitive paths may fetch the wrong resource - confirm
#       whether that is intended.
#
# Returns a two-element Array [title, html], or nil when the normalized URL
# is not an http/https URL. URI.parse and the fetch itself may still raise.
def self.find_remote(url)
  url = normalize_url(url)
  original_uri = URI.parse(url)

  # Only ever fetch over HTTP(S). Handing an arbitrary string to Kernel#open
  # would open local paths (e.g. "/etc/passwd") as files.
  return unless original_uri.is_a?(URI::HTTP)

  require 'open-uri'
  require 'ruby-readability'

  opts = {
    tags: %w[div p code pre h1 h2 h3 b em i strong a img ul li ol blockquote],
    attributes: %w[href src],
    remove_empty_nodes: false
  }
  opts[:whitelist] = SiteSetting.embed_whitelist_selector if SiteSetting.embed_whitelist_selector.present?
  opts[:blacklist] = SiteSetting.embed_blacklist_selector if SiteSetting.embed_blacklist_selector.present?

  # URI#read (from open-uri) fetches the body and closes the connection.
  readable = Readability::Document.new(original_uri.read, opts)
  title = readable.title
  doc = Nokogiri::HTML(readable.content)

  # Element name => attribute that carries its URL.
  url_attributes = { 'img' => 'src', 'script' => 'src', 'a' => 'href' }
  doc.search(url_attributes.keys.join(',')).each do |node|
    attribute = url_attributes[node.name]
    src = node[attribute]
    next if src.nil? || src.empty?

    begin
      uri = URI.parse(src)
      # Leave anything that already names a host or a scheme (absolute,
      # protocol-relative, mailto:, data:, ...) untouched.
      next if uri.host || uri.scheme

      # Resolve against the page URL so the port and path-relative references
      # are handled, rather than only pasting scheme and host on.
      node[attribute] = original_uri.merge(uri).to_s
    rescue URI::Error
      # If there is a mistyped URL, just do nothing
    end
  end

  [title, doc.to_html]
end
2013-12-31 14:37:43 -05:00
2014-04-01 18:16:56 -04:00
# Fetches the page at url and imports it as an embed.
#
# user - passed through to import.
# url  - the page URL to fetch and import.
# opts - optional Hash; opts[:title], when set, replaces the fetched title.
#
# Returns whatever import returns.
def self.import_remote(user, url, opts = nil)
  opts ||= {}
  fetched_title, body = find_remote(url)
  chosen_title = opts[:title] || fetched_title
  TopicEmbed.import(user, url, chosen_title, body)
end
# Convert any relative URLs to absolute. RSS is annoying for this.
#
# Rewrites host-relative link targets (<a href>) and image sources
# (<img src>), i.e. values starting with a single "/", so they point at the
# scheme, host and port of url.
#
# url      - the page URL the contents came from; normalized before use.
# contents - the HTML to rewrite (interpolated into a wrapper <div>).
#
# Returns the rewritten HTML as a String.
def self.absolutize_urls(url, contents)
  uri = URI(normalize_url(url))
  prefix = "#{uri.scheme}://#{uri.host}"
  # Compare with the scheme's own default port instead of hard-coding 80/443,
  # so e.g. https on port 80 keeps its explicit port.
  prefix += ":#{uri.port}" if uri.port != uri.default_port

  fragment = Nokogiri::HTML.fragment("<div>#{contents}</div>")

  # Element name => attribute that carries its URL.
  { 'a' => 'href', 'img' => 'src' }.each do |tag, attribute|
    fragment.css(tag).each do |node|
      value = node[attribute]
      next unless value.present? && value.start_with?('/')
      # "//host/path" is protocol-relative and already names its host;
      # prefixing it would turn the host into a path segment.
      next if value.start_with?('//')

      # value starts with exactly one "/", so plain concatenation is enough.
      node[attribute] = "#{prefix}#{value}"
    end
  end

  fragment.at('div').inner_html
end
# Looks up the topic that an embed URL is attached to.
#
# embed_url - the URL to look up; normalized before the comparison, which is
#             done against the lowercased stored value.
#
# Returns the topic_id of the first matching embed, or nil when none match.
def self.topic_id_for_embed(embed_url)
  normalized = normalize_url(embed_url)
  matches = TopicEmbed.where("lower(embed_url) = ?", normalized)
  matches.pluck(:topic_id).first
end
2014-03-18 18:02:33 -04:00
# Extracts a short excerpt from an HTML document.
#
# Non-empty <p> elements are concatenated (as HTML) until the accumulated
# markup is at least 100 characters long. If the document has no paragraph
# with text, the first <div> is used instead.
#
# html - the HTML to parse.
#
# Returns the excerpt HTML as a String, or nil when there is neither a
# paragraph with text nor a <div>.
def self.first_paragraph_from(html)
  doc = Nokogiri::HTML(html)

  result = ""
  doc.css('p').each do |paragraph|
    next unless paragraph.text.present?

    result << paragraph.to_s
    return result if result.size >= 100
  end
  return result unless result.blank?

  # If there is no first paragraph, return the first div (onebox).
  # Serialized to a String so every non-nil return has the same type; the
  # caller in this file appends to and hashes the result as text.
  first_div = doc.css('div').first
  first_div.to_s if first_div
end
2014-04-03 11:30:43 -04:00
# Returns the full remote content for the topic a post belongs to.
#
# The result is cached under a per-topic key for 10 minutes; on a cache miss
# the topic's embed URL is looked up, the page is fetched via find_remote,
# and the "imported from" footer is appended to the fetched body.
#
# post - the post whose topic_id identifies the embed.
#
# Returns the body with the footer appended.
def self.expanded_for(post)
  cache_key = "embed-topic:#{post.topic_id}"

  Rails.cache.fetch(cache_key, expires_in: 10.minutes) do
    embed_urls = TopicEmbed.where(topic_id: post.topic_id).pluck(:embed_url)
    url = embed_urls.first
    _title, body = TopicEmbed.find_remote(url)
    footer = TopicEmbed.imported_from_html(url)
    body << footer
    body
  end
end
2013-12-31 14:37:43 -05:00
end
2014-02-07 11:07:36 +11:00
# == Schema Information
#
# Table name: topic_embeds
#
# id :integer not null, primary key
# topic_id :integer not null
# post_id :integer not null
# embed_url :string(255) not null
2014-04-08 17:35:44 +02:00
# content_sha1 :string(40)
2014-08-27 15:19:25 +10:00
# created_at :datetime not null
# updated_at :datetime not null
2014-02-07 11:07:36 +11:00
#
# Indexes
#
# index_topic_embeds_on_embed_url (embed_url) UNIQUE
#