Updated import for TypePad

This commit is contained in:
Robin Ward 2014-03-18 18:02:33 -04:00
parent 44888f7acc
commit d3f1eb395d
7 changed files with 158 additions and 26 deletions

View file

@ -10,6 +10,9 @@ class TopicEmbed < ActiveRecord::Base
def self.import(user, url, title, contents)
return unless url =~ /^https?\:\/\//
if SiteSetting.embed_truncate
contents = first_paragraph_from(contents)
end
contents << "\n<hr>\n<small>#{I18n.t('embed.imported_from', link: "<a href='#{url}'>#{url}</a>")}</small>\n"
embed = TopicEmbed.where(embed_url: url).first
@ -34,6 +37,7 @@ class TopicEmbed < ActiveRecord::Base
end
end
else
absolutize_urls(url, contents)
post = embed.post
# Update the topic if it changed
if content_sha1 != embed.content_sha1
@ -63,7 +67,7 @@ class TopicEmbed < ActiveRecord::Base
prefix = "#{uri.scheme}://#{uri.host}"
prefix << ":#{uri.port}" if uri.port != 80 && uri.port != 443
fragment = Nokogiri::HTML.fragment(contents)
fragment = Nokogiri::HTML.fragment("<div>#{contents}</div>")
fragment.css('a').each do |a|
href = a['href']
if href.present? && href.start_with?('/')
@ -76,14 +80,28 @@ class TopicEmbed < ActiveRecord::Base
a['src'] = "#{prefix}/#{src.sub(/^\/+/, '')}"
end
end
fragment.to_html
fragment.at('div').inner_html
end
# Looks up the topic associated with an embedded URL.
#
# embed_url - the URL stored in the embed_url column of a TopicEmbed row.
#
# Returns the topic_id of the first matching row, or nil when none matches.
def self.topic_id_for_embed(embed_url)
  matching_embeds = TopicEmbed.where(embed_url: embed_url)
  topic_ids = matching_embeds.pluck(:topic_id)
  topic_ids.first
end
# Builds a short HTML excerpt from the start of a document.
#
# Paragraphs that contain text are appended in document order until the
# accumulated markup is at least 100 characters long. When no paragraph has
# any text, the markup of the first <div> is used instead (e.g. a onebox).
#
# html - String of HTML to excerpt.
#
# Returns a String of HTML. It is empty when the document has neither a
# paragraph with text nor a div.
def self.first_paragraph_from(html)
  doc = Nokogiri::HTML(html)
  result = ""
  doc.css('p').each do |para|
    next unless para.text.present?

    result << para.to_s
    return result if result.size >= 100
  end
  return result unless result.blank?

  # If there is no first paragraph, return the first div (onebox).
  # Serialize with to_s so a String always comes back: the importer appends
  # text to the result with <<, and a missing div (nil) becomes "".
  doc.css('div').first.to_s
end
end
# == Schema Information

View file

@ -49,7 +49,7 @@ en:
loading: "Loading Discussion..."
permalink: "Permalink"
imported_from: "Discussion topic for the original blog entry at: %{link}"
imported_from: "This is a companion discussion topic for the original blog entry at: %{link}"
in_reply_to: "in reply to %{username}"
replies:
one: "1 reply"
@ -859,6 +859,7 @@ en:
feed_polling_enabled: "Whether to import a RSS/ATOM feed as posts"
feed_polling_url: "URL of RSS/ATOM feed to import"
embed_by_username: "Discourse username of the user who creates the topics"
embed_truncate: "Truncate the imported posts"
embed_category: "Category of created topics"
embed_post_limit: "Maximum number of posts to embed"
tos_accept_required: "If enabled, users will need to check a box on the signup form to confirm that they accept the terms of service. Edit 'Signup Form: Terms of Service Message' in the Content tab to change the message."

View file

@ -403,6 +403,7 @@ embedding:
embed_by_username: ''
embed_category: ''
embed_post_limit: 100
embed_truncate: false
legal:
tos_url:

View file

@ -209,6 +209,7 @@ class PostCreator
end
def rollback_if_host_spam_detected
return if @opts[:skip_validations]
if @post.has_host_spam?
@post.errors.add(:base, I18n.t(:spamming_host))
@errors = @post.errors

View file

@ -1,14 +1,20 @@
require 'open-uri'
class Typepad < Thor
# Thor declaration for the `import` task: its help text and command-line options.
# The description names TypePad because this task reads the TypePad export
# passed via --file.
desc "import", "Imports posts from a TypePad export file"
method_option :file, aliases: '-f', required: true, desc: "The typepad file to import"
method_option :dry_run, required: false, desc: "Just output what will be imported rather than doing it"
method_option :post_as, aliases: '-p', required: true, desc: "The Discourse username to post as"
method_option :google_api, aliases: '-g', required: false, desc: "The google plus API key to use to fetch usernames"
def import
require './config/environment'
email_blacklist = SiteSetting.email_domains_blacklist
backup_settings = {}
%w(email_domains_blacklist).each do |s|
backup_settings[s] = SiteSetting.send(s)
end
user = User.where(username_lower: options[:post_as].downcase).first
if user.nil?
@ -24,44 +30,56 @@ class Typepad < Thor
inside_block = true
entry = ""
n = 0
entries = []
File.open(options[:file]).each_line do |l|
l = l.scrub
if l =~ /^--------$/
entries << process_entry(entry)
parsed_entry = process_entry(entry)
if parsed_entry
puts "Parsed #{parsed_entry[:title]}"
entries << parsed_entry
end
entry = ""
else
entry << l
end
break if entries.size > 5
end
entries.compact!
entries.each_with_index do |e,i|
if e[:title] =~ /Head/
puts "#{i}: #{e[:title]}"
end
end
RateLimiter.disable
SiteSetting.email_domains_blacklist = ""
puts "import it"
puts entries.size
entries.each do |entry|
puts "Importing #{entries.size} entries"
entries.each_with_index do |entry, idx|
puts "Importing (#{idx+1}/#{entries.size})"
next if entry[:body].blank?
puts entry[:unique_url]
post = TopicEmbed.import(user, entry[:unique_url], entry[:title], entry[:body])
if post.present?
post.update_column(:created_at, entry[:date])
post.topic.update_column(:created_at, entry[:date])
post.topic.update_column(:bumped_at, entry[:date])
entry[:comments].each do |c|
username = c[:author]
if c[:email].present?
if c[:email].present? && c[:email] != "none@unknown.com"
email = c[:email]
post_user = User.where(email: email).first
if post_user.blank?
post_user = User.create!(email: email, username: UserNameSuggester.suggest(username))
post_user = User.create!(name: c[:name], email: email, username: UserNameSuggester.suggest(username))
end
else
suggested = UserNameSuggester.suggest(username)
post_user = User.where(username: suggested)
post_user = User.where(username: username).first
if post_user.blank?
post_user = User.create!(email: "#{suggested}@no-email-found.com", username: UserNameSuggester.suggest(username))
suggested = UserNameSuggester.suggest(username)
post_user = User.create!(name: c[:name], email: "#{suggested}@no-email-found.com", username: suggested)
end
end
@ -69,16 +87,24 @@ class Typepad < Thor
topic_id: post.topic_id,
raw: c[:body],
cooked: c[:body],
created_at: Time.now
created_at: c[:date],
skip_validations: true
}
post = PostCreator.new(post_user, attrs).create
begin
post = PostCreator.new(post_user, attrs).create
puts post.errors.inspect if post.id.blank?
rescue => ex
puts "Error creating post: #{ex.inspect}"
end
end
end
end
ensure
RateLimiter.enable
SiteSetting.email_domains_blacklist = email_blacklist
backup_settings.each do |s, v|
SiteSetting.send("#{s.to_s}=", v)
end
end
private
@ -92,14 +118,14 @@ class Typepad < Thor
def parse_meta_data(section)
result = {}
section.split(/\n/).each do |l|
if l =~ /^([^:]+)\: (.*)$/
if l =~ /^([A-Z\ ]+)\: (.*)$/
key, value = Regexp.last_match[1], Regexp.last_match[2]
clean_type!(key)
value.strip!
result[key.to_sym] = value
else
result[:body] ||= ""
result[:body] << l
result[:body] << l << "\n"
end
end
result
@ -123,6 +149,7 @@ class Typepad < Thor
sections = entry.split(/-----/)
entry = parse_meta_data(sections[0]).slice(:date, :title, :unique_url)
entry[:comments] = []
entry[:date] = entry[:date] ? DateTime.strptime(entry[:date], "%m/%d/%Y") : Time.now
sections[1..-1].each do |s|
type, value = parse_section(s)
case type
@ -130,12 +157,90 @@ class Typepad < Thor
entry[type] = value
when :comment
comment = parse_comment(value).slice(:author, :email, :url, :body, :date)
if options[:google_api] && comment[:author] =~ /plus.google.com\/(\d+)/
gplus_id = Regexp.last_match[1]
from_redis = $redis.get("gplus:#{gplus_id}")
if from_redis.blank?
json = ::JSON.parse(open("https://www.googleapis.com/plus/v1/people/#{gplus_id}?key=#{options[:google_api]}").read)
from_redis = json['displayName']
$redis.set("gplus:#{gplus_id}", from_redis)
end
comment[:author] = from_redis
end
if comment[:author] =~ /([^\.]+)\.wordpress\.com/
comment[:author] = Regexp.last_match[1]
end
if comment[:author] =~ /([^\.]+)\.blogspot\.com/
comment[:author] = Regexp.last_match[1]
end
if comment[:author] =~ /twitter.com\/([a-zA-Z0-9]+)/
comment[:author] = Regexp.last_match[1]
end
if comment[:author] =~ /www.facebook.com\/profile.php\?id=(\d+)/
fb_id = Regexp.last_match[1]
from_redis = $redis.get("fb:#{fb_id}")
if from_redis.blank?
json = ::JSON.parse(open("http://graph.facebook.com/#{fb_id}").read)
from_redis = json['username']
$redis.set("fb:#{fb_id}", from_redis)
end
comment[:author] = from_redis
end
comment[:name] = comment[:author]
if comment[:author]
comment[:author].gsub!(/^[_\.]+/, '')
comment[:author].gsub!(/[_\.]+$/, '')
if comment[:author].size < 12
comment[:author].gsub!(/ /, '_')
else
segments = []
current = ""
last_upper = nil
comment[:author].each_char do |c|
is_upper = /[[:upper:]]/.match(c)
if (current.size > 1 && is_upper != last_upper)
segments << current
current = ""
end
last_upper = is_upper
if c == " " || c == "." || c == "_" || c == "-"
segments << current
current = ""
else
current << c
end
end
segments.delete_if {|s| s.nil? || s.size < 2}
segments << current
comment[:author] = segments[0]
if segments.size > 1 && segments[1][0] =~ /[a-zA-Z]/
comment[:author] << segments[1][0]
end
end
end
comment[:author] = "commenter" if comment[:author].blank?
comment[:author] = "codinghorror" if comment[:author] == "Jeff Atwood" || comment[:author] == "JeffAtwood" || comment[:author] == "Jeff_Atwood"
comment[:date] = comment[:date] ? DateTime.strptime(comment[:date], "%m/%d/%Y") : Time.now
entry[:comments] << comment if comment[:body].present?
end
end
entry[:title] && entry[:body] ? entry : nil
end
end

View file

@ -19,8 +19,7 @@ module UserNameSuggester
end
def self.find_available_username_based_on(name)
sanitize_username!(name)
name = rightsize_username(name)
name = rightsize_username(sanitize_username!(name))
i = 1
attempt = name
until User.username_available?(attempt)
@ -33,8 +32,11 @@ module UserNameSuggester
end
# Reduces an arbitrary name to characters usable in a username.
#
# The name is first transliterated (the accompanying spec expects "Jørn" to
# become "Jorn"), then cleaned in three passes:
#   1. strip non-alphanumerics at the start and non-word characters at the end,
#   2. collapse every remaining run of non-word characters into one underscore,
#   3. strip any underscores left at the start.
#
# name - the String to sanitize.
#
# Returns the sanitized String; callers should use the return value.
def self.sanitize_username!(name)
  ActiveSupport::Inflector.transliterate(name)
    .gsub(/^[^[:alnum:]]+|\W+$/, "")
    .gsub(/\W+/, "_")
    .gsub(/^\_+/, '')
end
def self.rightsize_username(name)

View file

@ -22,6 +22,10 @@ describe UserNameSuggester do
UserNameSuggester.suggest("Darth%^Vader").should == 'Darth_Vader'
end
it "transliterates some characters" do
UserNameSuggester.suggest("Jørn").should == 'Jorn'
end
it 'adds 1 to an existing username' do
user = Fabricate(:user)
UserNameSuggester.suggest(user.username).should == "#{user.username}1"