diff --git a/app/models/topic_embed.rb b/app/models/topic_embed.rb index 99498db26..2be0005f8 100644 --- a/app/models/topic_embed.rb +++ b/app/models/topic_embed.rb @@ -10,6 +10,9 @@ class TopicEmbed < ActiveRecord::Base def self.import(user, url, title, contents) return unless url =~ /^https?\:\/\// + if SiteSetting.embed_truncate + contents = first_paragraph_from(contents) + end contents << "\n
\n#{I18n.t('embed.imported_from', link: "#{url}")}\n" embed = TopicEmbed.where(embed_url: url).first @@ -34,6 +37,7 @@ class TopicEmbed < ActiveRecord::Base end end else + absolutize_urls(url, contents) post = embed.post # Update the topic if it changed if content_sha1 != embed.content_sha1 @@ -63,7 +67,7 @@ class TopicEmbed < ActiveRecord::Base prefix = "#{uri.scheme}://#{uri.host}" prefix << ":#{uri.port}" if uri.port != 80 && uri.port != 443 - fragment = Nokogiri::HTML.fragment(contents) + fragment = Nokogiri::HTML.fragment("<div>#{contents}</div>") fragment.css('a').each do |a| href = a['href'] if href.present? && href.start_with?('/') @@ -76,14 +80,28 @@ class TopicEmbed < ActiveRecord::Base a['src'] = "#{prefix}/#{src.sub(/^\/+/, '')}" end end - - fragment.to_html + fragment.at('div').inner_html end def self.topic_id_for_embed(embed_url) TopicEmbed.where(embed_url: embed_url).pluck(:topic_id).first end + def self.first_paragraph_from(html) + doc = Nokogiri::HTML(html) + + result = "" + doc.css('p').each do |p| + if p.text.present? + result << p.to_s + return result if result.size >= 100 + end + end + return result unless result.blank? + + # If there is no first paragraph, return the first div (onebox) + doc.css('div').first + end end # == Schema Information diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml index 0c5298f1b..c0d15c96c 100644 --- a/config/locales/server.en.yml +++ b/config/locales/server.en.yml @@ -49,7 +49,7 @@ en: loading: "Loading Discussion..." permalink: "Permalink" - imported_from: "Discussion topic for the original blog entry at: %{link}" + imported_from: "This is a companion discussion topic for the original blog entry at: %{link}" in_reply_to: "in reply to %{username}" replies: one: "1 reply" @@ -859,6 +859,7 @@ en: feed_polling_enabled: "Whether to import a RSS/ATOM feed as posts" feed_polling_url: "URL of RSS/ATOM feed to import" embed_by_username: "Discourse username of the user who creates the topics" + embed_truncate: "Truncate the imported posts" embed_category: "Category of created topics" embed_post_limit: "Maximum number of posts to embed" tos_accept_required: "If enabled, users will need to check a box on the signup form to confirm that they accept the terms of service. Edit 'Signup Form: Terms of Service Message' in the Content tab to change the message." 
diff --git a/config/site_settings.yml b/config/site_settings.yml index 1902b8b8f..1e77b9acd 100644 --- a/config/site_settings.yml +++ b/config/site_settings.yml @@ -403,6 +403,7 @@ embedding: embed_by_username: '' embed_category: '' embed_post_limit: 100 + embed_truncate: false legal: tos_url: diff --git a/lib/post_creator.rb b/lib/post_creator.rb index 29bfd6281..25de03753 100644 --- a/lib/post_creator.rb +++ b/lib/post_creator.rb @@ -209,6 +209,7 @@ class PostCreator end def rollback_if_host_spam_detected + return if @opts[:skip_validations] if @post.has_host_spam? @post.errors.add(:base, I18n.t(:spamming_host)) @errors = @post.errors diff --git a/lib/tasks/typepad.thor b/lib/tasks/typepad.thor index 0adc59ce3..71d3fa79f 100644 --- a/lib/tasks/typepad.thor +++ b/lib/tasks/typepad.thor @@ -1,14 +1,20 @@ +require 'open-uri' + class Typepad < Thor desc "import", "Imports posts from a Disqus XML export" method_option :file, aliases: '-f', required: true, desc: "The typepad file to import" method_option :dry_run, required: false, desc: "Just output what will be imported rather than doing it" method_option :post_as, aliases: '-p', required: true, desc: "The Discourse username to post as" + method_option :google_api, aliases: '-g', required: false, desc: "The google plus API key to use to fetch usernames" def import require './config/environment' - email_blacklist = SiteSetting.email_domains_blacklist + backup_settings = {} + %w(email_domains_blacklist).each do |s| + backup_settings[s] = SiteSetting.send(s) + end user = User.where(username_lower: options[:post_as].downcase).first if user.nil? 
@@ -24,44 +30,56 @@ class Typepad < Thor inside_block = true entry = "" - n = 0 entries = [] File.open(options[:file]).each_line do |l| l = l.scrub if l =~ /^--------$/ - entries << process_entry(entry) + parsed_entry = process_entry(entry) + if parsed_entry + puts "Parsed #{parsed_entry[:title]}" + entries << parsed_entry + end entry = "" else entry << l end - break if entries.size > 5 end - entries.compact! + entries.each_with_index do |e,i| + if e[:title] =~ /Head/ + puts "#{i}: #{e[:title]}" + end + end RateLimiter.disable - SiteSetting.email_domains_blacklist = "" - puts "import it" - puts entries.size - entries.each do |entry| + puts "Importing #{entries.size} entries" + entries.each_with_index do |entry, idx| + puts "Importing (#{idx+1}/#{entries.size})" + next if entry[:body].blank? + + puts entry[:unique_url] post = TopicEmbed.import(user, entry[:unique_url], entry[:title], entry[:body]) if post.present? + post.update_column(:created_at, entry[:date]) + post.topic.update_column(:created_at, entry[:date]) + post.topic.update_column(:bumped_at, entry[:date]) entry[:comments].each do |c| username = c[:author] - if c[:email].present? + + if c[:email].present? && c[:email] != "none@unknown.com" email = c[:email] post_user = User.where(email: email).first if post_user.blank? - post_user = User.create!(email: email, username: UserNameSuggester.suggest(username)) + post_user = User.create!(name: c[:name], email: email, username: UserNameSuggester.suggest(username)) end else - suggested = UserNameSuggester.suggest(username) - post_user = User.where(username: suggested) + post_user = User.where(username: username).first if post_user.blank? 
- post_user = User.create!(email: "#{suggested}@no-email-found.com", username: UserNameSuggester.suggest(username)) + suggested = UserNameSuggester.suggest(username) + post_user = User.create!(name: c[:name], email: "#{suggested}@no-email-found.com", username: suggested) end end @@ -69,16 +87,24 @@ class Typepad < Thor topic_id: post.topic_id, raw: c[:body], cooked: c[:body], - created_at: Time.now + created_at: c[:date], + skip_validations: true } - post = PostCreator.new(post_user, attrs).create + begin + post = PostCreator.new(post_user, attrs).create + puts post.errors.inspect if post.id.blank? + rescue => ex + puts "Error creating post: #{ex.inspect}" + end end end end ensure RateLimiter.enable - SiteSetting.email_domains_blacklist = email_blacklist + backup_settings.each do |s, v| + SiteSetting.send("#{s.to_s}=", v) + end end private @@ -92,14 +118,14 @@ class Typepad < Thor def parse_meta_data(section) result = {} section.split(/\n/).each do |l| - if l =~ /^([^:]+)\: (.*)$/ + if l =~ /^([A-Z\ ]+)\: (.*)$/ key, value = Regexp.last_match[1], Regexp.last_match[2] clean_type!(key) value.strip! result[key.to_sym] = value else result[:body] ||= "" - result[:body] << l + result[:body] << l << "\n" end end result @@ -123,6 +149,7 @@ class Typepad < Thor sections = entry.split(/-----/) entry = parse_meta_data(sections[0]).slice(:date, :title, :unique_url) entry[:comments] = [] + entry[:date] = entry[:date] ? DateTime.strptime(entry[:date], "%m/%d/%Y") : Time.now sections[1..-1].each do |s| type, value = parse_section(s) case type @@ -130,12 +157,90 @@ class Typepad < Thor entry[type] = value when :comment comment = parse_comment(value).slice(:author, :email, :url, :body, :date) + + if options[:google_api] && comment[:author] =~ /plus.google.com\/(\d+)/ + gplus_id = Regexp.last_match[1] + from_redis = $redis.get("gplus:#{gplus_id}") + if from_redis.blank? 
+ json = ::JSON.parse(open("https://www.googleapis.com/plus/v1/people/#{gplus_id}?key=#{options[:google_api]}").read) + from_redis = json['displayName'] + $redis.set("gplus:#{gplus_id}", from_redis) + end + comment[:author] = from_redis + end + + if comment[:author] =~ /([^\.]+)\.wordpress\.com/ + comment[:author] = Regexp.last_match[1] + end + + if comment[:author] =~ /([^\.]+)\.blogspot\.com/ + comment[:author] = Regexp.last_match[1] + end + + if comment[:author] =~ /twitter.com\/([a-zA-Z0-9]+)/ + comment[:author] = Regexp.last_match[1] + end + + if comment[:author] =~ /www.facebook.com\/profile.php\?id=(\d+)/ + fb_id = Regexp.last_match[1] + from_redis = $redis.get("fb:#{fb_id}") + if from_redis.blank? + json = ::JSON.parse(open("http://graph.facebook.com/#{fb_id}").read) + from_redis = json['username'] + $redis.set("fb:#{fb_id}", from_redis) + end + comment[:author] = from_redis + end + + comment[:name] = comment[:author] + if comment[:author] + comment[:author].gsub!(/^[_\.]+/, '') + comment[:author].gsub!(/[_\.]+$/, '') + + if comment[:author].size < 12 + comment[:author].gsub!(/ /, '_') + else + segments = [] + current = "" + + last_upper = nil + comment[:author].each_char do |c| + is_upper = /[[:upper:]]/.match(c) + + if (current.size > 1 && is_upper != last_upper) + segments << current + current = "" + end + last_upper = is_upper + + if c == " " || c == "." || c == "_" || c == "-" + segments << current + current = "" + else + current << c + end + end + segments.delete_if {|s| s.nil? || s.size < 2} + segments << current + + comment[:author] = segments[0] + if segments.size > 1 && segments[1][0] =~ /[a-zA-Z]/ + comment[:author] << segments[1][0] + end + end + end + + comment[:author] = "commenter" if comment[:author].blank? + comment[:author] = "codinghorror" if comment[:author] == "Jeff Atwood" || comment[:author] == "JeffAtwood" || comment[:author] == "Jeff_Atwood" + + comment[:date] = comment[:date] ? 
DateTime.strptime(comment[:date], "%m/%d/%Y") : Time.now entry[:comments] << comment if comment[:body].present? end end entry[:title] && entry[:body] ? entry : nil end + end diff --git a/lib/user_name_suggester.rb b/lib/user_name_suggester.rb index 17f97885c..8c5fc8c85 100644 --- a/lib/user_name_suggester.rb +++ b/lib/user_name_suggester.rb @@ -19,8 +19,7 @@ module UserNameSuggester end def self.find_available_username_based_on(name) - sanitize_username!(name) - name = rightsize_username(name) + name = rightsize_username(sanitize_username!(name)) i = 1 attempt = name until User.username_available?(attempt) @@ -33,12 +32,15 @@ module UserNameSuggester end def self.sanitize_username!(name) + name = ActiveSupport::Inflector.transliterate(name) name.gsub!(/^[^[:alnum:]]+|\W+$/, "") name.gsub!(/\W+/, "_") + name.gsub!(/^\_+/, '') + name end def self.rightsize_username(name) name.ljust(User.username_length.begin, '1')[0, User.username_length.end] end -end \ No newline at end of file +end diff --git a/spec/components/user_name_suggester_spec.rb b/spec/components/user_name_suggester_spec.rb index a70be9afd..7cd0b16b7 100644 --- a/spec/components/user_name_suggester_spec.rb +++ b/spec/components/user_name_suggester_spec.rb @@ -22,6 +22,10 @@ describe UserNameSuggester do UserNameSuggester.suggest("Darth%^Vader").should == 'Darth_Vader' end + it "transliterates some characters" do + UserNameSuggester.suggest("Jørn").should == 'Jorn' + end + it 'adds 1 to an existing username' do user = Fabricate(:user) UserNameSuggester.suggest(user.username).should == "#{user.username}1"