From ab50d039bbf52a28a337db2c26ac139654e1eecf Mon Sep 17 00:00:00 2001
From: Robin Ward
Date: Fri, 22 May 2015 15:40:26 -0400
Subject: [PATCH] Importer for nabble

---
 lib/email/receiver.rb           |   5 +-
 script/import_scripts/nabble.rb | 172 ++++++++++++++++++++++++++++++++
 2 files changed, 176 insertions(+), 1 deletion(-)
 create mode 100644 script/import_scripts/nabble.rb

diff --git a/lib/email/receiver.rb b/lib/email/receiver.rb
index 1fd7afe2b..4c1b5760b 100644
--- a/lib/email/receiver.rb
+++ b/lib/email/receiver.rb
@@ -24,8 +24,9 @@ module Email
 
     attr_reader :body, :email_log
 
-    def initialize(raw)
+    def initialize(raw, opts=nil)
       @raw = raw
+      @opts = opts || {}
     end
 
     def process
@@ -135,6 +136,8 @@ module Email
         body = fix_charset message
       end
 
+      return body if @opts[:skip_sanity_check]
+
       # Certain trigger phrases that means we didn't parse correctly
       if body =~ /Content\-Type\:/ || body =~ /multipart\/alternative/ || body =~ /text\/plain/
         raise EmptyEmailError
diff --git a/script/import_scripts/nabble.rb b/script/import_scripts/nabble.rb
new file mode 100644
index 000000000..c975b5cf1
--- /dev/null
+++ b/script/import_scripts/nabble.rb
@@ -0,0 +1,172 @@
+require File.expand_path(File.dirname(__FILE__) + "/base.rb")
+require 'pg'
+
+# Imports users, topics and replies from a Nabble PostgreSQL database into
+# Discourse.
+#
+# NOTE(review): the class name looks like a leftover from another importer;
+# it is kept unchanged so the entry point at the bottom of the file still works.
+class ImportScripts::MyAskBot < ImportScripts::Base
+  # CHANGE THESE BEFORE RUNNING THE IMPORTER
+
+  BATCH_SIZE = 1000
+
+  DB_NAME = "nabble"
+  CATEGORY_ID = 6
+
+  def initialize
+    super
+
+    @tagmap = []
+    # Decodes PostgreSQL "timestamp with time zone" text into Ruby time values.
+    @td = PG::TextDecoder::TimestampWithTimeZone.new
+    @client = PG.connect(dbname: DB_NAME)
+  end
+
+  # Runs the import stages in order: users, then topics, then replies.
+  def execute
+    import_users
+    create_forum_topics
+    import_replies
+  end
+
+  # Imports every row of the user_ table, BATCH_SIZE rows at a time.
+  def import_users
+    puts "", "importing users"
+
+    total_count = @client.exec("SELECT COUNT(user_id) FROM user_")[0]["count"]
+
+    batches(BATCH_SIZE) do |offset|
+      # user_id breaks ties between equal join times: LIMIT/OFFSET paging can
+      # skip or repeat rows when the sort order is not total.
+      users = @client.query(<<-SQL
+        SELECT user_id, name, email, joined
+        FROM user_
+        ORDER BY joined, user_id
+        LIMIT #{BATCH_SIZE}
+        OFFSET #{offset}
+      SQL
+      )
+
+      break if users.ntuples() < 1
+
+      create_users(users, total: total_count, offset: offset) do |user|
+        {
+          id: user["user_id"],
+          # Users without an address get a random placeholder.
+          email: user["email"] || (SecureRandom.hex << "@domain.com"),
+          created_at: Time.zone.at(@td.decode(user["joined"])),
+          name: user["name"]
+        }
+      end
+    end
+  end
+
+  # Extracts the body text from a raw email message. The receiver's sanity
+  # check is skipped so bodies that merely mention MIME headers are kept.
+  def parse_email(msg)
+    receiver = Email::Receiver.new(msg, skip_sanity_check: true)
+    mail = Mail.read_from_string(msg)
+    receiver.parse_body(mail)
+  end
+
+  # node_id of the first node flagged is_app. Memoized so the lookup runs
+  # once per import instead of once per stage.
+  def app_node_id
+    @app_node_id ||= @client.exec("SELECT node_id FROM node WHERE is_app LIMIT 1")[0]['node_id']
+  end
+
+  # Creates one topic for every node that is a direct child of the app node.
+  def create_forum_topics
+    puts "", "creating forum topics"
+
+    topic_count = @client.exec("SELECT COUNT(node_id) AS count FROM node WHERE parent_id = #{app_node_id}")[0]["count"]
+
+    batches(BATCH_SIZE) do |offset|
+      # node_id breaks ties between equal creation times so paging is stable.
+      topics = @client.exec <<-SQL
+        SELECT n.node_id, n.subject, n.owner_id, n.when_created, nm.message, n.msg_fmt
+        FROM node AS n
+        INNER JOIN node_msg AS nm ON nm.node_id = n.node_id
+        WHERE n.parent_id = #{app_node_id}
+        ORDER BY n.when_created, n.node_id
+        LIMIT #{BATCH_SIZE}
+        OFFSET #{offset}
+      SQL
+
+      break if topics.ntuples() < 1
+
+      create_posts(topics, total: topic_count, offset: offset) do |t|
+        raw = body_from(t)
+        next unless raw
+
+        { id: t['node_id'],
+          title: t['subject'],
+          user_id: user_id_from_imported_user_id(t["owner_id"]) || Discourse::SYSTEM_USER_ID,
+          created_at: Time.zone.at(@td.decode(t["when_created"])),
+          category: CATEGORY_ID,
+          raw: raw }
+      end
+    end
+  end
+
+  # Returns the body for a node row: formats "m" and "s" are parsed as raw
+  # email, anything else is used verbatim. Logs and returns nil when the
+  # email parser raises EmptyEmailError.
+  def body_from(p)
+    %w(m s).include?(p['msg_fmt']) ? parse_email(p['message']) : p['message']
+  rescue Email::Receiver::EmptyEmailError
+    puts "Skipped #{p['node_id']}"
+  end
+
+  # Creates a post for every node that is not a direct child of the app node,
+  # attaching it to the topic its parent node belongs to.
+  def import_replies
+    puts "", "creating topic replies"
+
+    post_count = @client.exec("SELECT COUNT(node_id) AS count FROM node WHERE parent_id != #{app_node_id}")[0]["count"]
+
+    # Imported reply node_id => topic id, so replies whose parent is itself
+    # a reply can still find their topic.
+    topic_ids = {}
+
+    batches(BATCH_SIZE) do |offset|
+      # node_id breaks ties between equal creation times so paging is stable.
+      posts = @client.exec <<-SQL
+        SELECT n.node_id, n.parent_id, n.subject, n.owner_id, n.when_created, nm.message, n.msg_fmt
+        FROM node AS n
+        INNER JOIN node_msg AS nm ON nm.node_id = n.node_id
+        WHERE n.parent_id != #{app_node_id}
+        ORDER BY n.when_created, n.node_id
+        LIMIT #{BATCH_SIZE}
+        OFFSET #{offset}
+      SQL
+
+      break if posts.ntuples() < 1
+
+      create_posts(posts, total: post_count, offset: offset) do |p|
+        parent_id = p['parent_id']
+        id = p['node_id']
+
+        topic_id = topic_ids[parent_id]
+        unless topic_id
+          topic = topic_lookup_from_imported_post_id(parent_id)
+          topic_id = topic[:topic_id] if topic
+        end
+        next unless topic_id
+
+        topic_ids[id] = topic_id
+
+        raw = body_from(p)
+        next unless raw
+        { id: id,
+          topic_id: topic_id,
+          user_id: user_id_from_imported_user_id(p['owner_id']) || Discourse::SYSTEM_USER_ID,
+          created_at: Time.zone.at(@td.decode(p["when_created"])),
+          raw: raw }
+      end
+    end
+  end
+end
+
+ImportScripts::MyAskBot.new.perform