discourse/app/jobs/scheduled/poll_feed.rb
Matt Palmer 394cd43d77 Scrub only after converting strings to UTF-8
Scrubbing an ASCII-8BIT string isn't ever going to remove anything, because
there's no code point that isn't valid 8-bit ASCII.  Since we'd really
prefer it if everything were UTF-8 anyway, we'll just assume, for now, that
whatever comes out of SimpleRSS is probably UTF-8, and just nuke anything
that isn't a valid UTF-8 codepoint.

Of course, the *real* bug here is that SimpleRSS [unilaterally converts
everything to
ASCII-8BIT](https://github.com/cardmagic/simple-rss/issues/15).  It's
presumably *far* too much to ask that it detects the encoding of the source
RSS feed and marks the parsed strings with the correct encoding...
2016-08-25 16:09:07 +10:00

136 lines
2.8 KiB
Ruby

#
# Creates and updates topics based on an RSS or Atom feed.
#
require 'digest/sha1'
require_dependency 'post_creator'
require_dependency 'post_revisor'
require 'open-uri'
module Jobs
class PollFeed < Jobs::Scheduled
every 1.hour
sidekiq_options retry: false
# Job entry point: polls the feed only when feed polling is switched on
# and a feed URL has been configured.
#
# @param args [Object] job arguments; not used by this job
# @return the result of #poll_feed, or nil when polling is skipped
def execute(args)
  return unless SiteSetting.feed_polling_enabled?
  return unless SiteSetting.feed_polling_url.present?

  poll_feed
end
# Key string derived from the configured feed URL, in the form
# "feed-modified:<SHA1 hex digest of the URL>". The value is computed once
# per job instance and then reused.
#
# @return [String]
def feed_key
  return @feed_key if @feed_key

  url_digest = Digest::SHA1.hexdigest(SiteSetting.feed_polling_url)
  @feed_key = "feed-modified:#{url_digest}"
end
# Reads the configured feed and imports every topic it yields.
#
# @return the result of #import_topics
def poll_feed
  import_topics(Feed.new.topics)
end
private
# Imports each feed topic in turn via #import_topic.
#
# @param feed_topics [Enumerable] feed entries to import
# @return the collection that was passed in
def import_topics(feed_topics)
  feed_topics.each(&method(:import_topic))
end
# Imports a single feed entry as an embedded topic. Entries for which no
# user can be resolved are skipped.
#
# @param topic [#user, #url, #title, #content] wrapper around one feed entry
# @return the result of TopicEmbed.import, or nil when the entry has no user
def import_topic(topic)
  # Resolve the user once: asking the topic twice repeats its user lookups.
  user = topic.user
  return unless user

  TopicEmbed.import(user, topic.url, topic.title, CGI.unescapeHTML(topic.content.scrub))
end
# Fetches the configured RSS/Atom feed and exposes its entries as FeedTopic
# objects.
class Feed
  require 'simple-rss'

  # Registers the configured author tag with SimpleRSS so it is parsed from
  # each entry. This runs once, when the class body is evaluated.
  # NOTE(review): a later change to the setting is presumably not picked up
  # until the class is reloaded - confirm.
  if SiteSetting.embed_username_key_from_feed.present?
    SimpleRSS.item_tags << SiteSetting.embed_username_key_from_feed.to_sym
  end

  # Reads the feed URL from the site settings, prefixing "http://" when the
  # value does not already start with an http(s) scheme.
  def initialize
    @feed_url = SiteSetting.feed_polling_url
    # \A anchors to the start of the whole string. The previous ^ anchor also
    # matched after a newline, so a multi-line value could reach Kernel#open
    # without the http:// prefix being applied.
    @feed_url = "http://#{@feed_url}" if @feed_url !~ %r{\Ahttps?://}
  end

  # @return [Array<FeedTopic>] one entry per feed item that has content
  def topics
    rss.items.map { |item| FeedTopic.new(item) }.select(&:content)
  end

  private

  # Downloads and parses the feed.
  #
  # The block form makes sure the downloaded stream is closed once parsing
  # has finished instead of leaving the handle open.
  # NOTE(review): allow_redirections presumably comes from the
  # open_uri_redirections gem - confirm it is loaded.
  def rss
    open(@feed_url, allow_redirections: :all) { |io| SimpleRSS.parse(io) }
  end
end
# Adapts one parsed feed entry to the attributes needed to import it as a
# topic: url, content, title and the user to post as.
class FeedTopic
  # Matches only values that begin with an http:// or https:// scheme.
  # \A is used instead of ^ so a later line of a multi-line value cannot
  # satisfy the check.
  HTTP_URL = %r{\Ahttps?://}

  # @param article_rss_item [#title, #content, #description, #link, #id]
  #   one parsed feed entry
  def initialize(article_rss_item)
    @article_rss_item = article_rss_item
  end

  # @return the entry's link when it is an http(s) URL, otherwise the entry id
  def url
    link = @article_rss_item.link
    url?(link) ? link : @article_rss_item.id
  end

  # @return [String, nil] the entry's content, falling back to its
  #   description, as scrubbed UTF-8; nil when the entry has neither
  def content
    utf8(@article_rss_item.content) || utf8(@article_rss_item.description)
  end

  # @return [String, nil] the entry's title as scrubbed UTF-8; nil when the
  #   entry has no title (previously this raised NoMethodError)
  def title
    utf8(@article_rss_item.title)
  end

  # @return the user named by the entry's author tag, or the default
  #   embedding user when the author cannot be resolved
  def user
    author_user || default_user
  end

  private

  # Re-tags a string as UTF-8 and replaces invalid byte sequences.
  # Works on a copy so the parsed feed item is left untouched; returns nil
  # for nil or anything that is not string-like.
  def utf8(raw)
    return nil unless raw.respond_to?(:force_encoding)

    raw.dup.force_encoding("UTF-8").scrub
  end

  # @return [Boolean] true only when link starts with http:// or https://
  def url?(link)
    !!(link.to_s =~ HTTP_URL)
  end

  # Reads the author name from the entry using the configured tag name.
  # public_send keeps a misconfigured tag name from invoking private
  # methods (e.g. Kernel#sleep or Kernel#exit) on the item.
  # Any failure while reading the tag is treated as "no author".
  def author_username
    key = SiteSetting.embed_username_key_from_feed
    return nil if key.blank?

    @article_rss_item.public_send(key.to_sym)
  rescue StandardError
    nil
  end

  def default_user
    find_user(SiteSetting.embed_by_username)
  end

  def author_user
    username = author_username
    return nil unless username.present?

    find_user(username)
  end

  # Looks a user up by lowercased username. The default user lookup has
  # always lowercased its key; doing it here applies the same rule to author
  # names taken from the feed, which previously only matched when the feed
  # already used lowercase.
  def find_user(user_name)
    User.where(username_lower: user_name.to_s.downcase).first
  end
end
end
end