From 3b35972d254876971c39e2b53c9fff73f3221a41 Mon Sep 17 00:00:00 2001 From: Robin Ward Date: Thu, 22 Oct 2015 15:02:53 -0400 Subject: [PATCH] Improvements to importing a mailing list --- script/import_scripts/mbox.rb | 52 ++++++++++++++++++++++++++++------- 1 file changed, 42 insertions(+), 10 deletions(-) diff --git a/script/import_scripts/mbox.rb b/script/import_scripts/mbox.rb index d47b5553f..3ff41e945 100755 --- a/script/import_scripts/mbox.rb +++ b/script/import_scripts/mbox.rb @@ -37,6 +37,7 @@ class ImportScripts::Mbox < ImportScripts::Base topics = [] topic_lookup = {} + topic_titles = {} replies = [] all_messages do |mail, filename| @@ -44,21 +45,55 @@ class ImportScripts::Mbox < ImportScripts::Base msg_id = mail['Message-ID'].to_s reply_to = mail['In-Reply-To'].to_s + title = clean_title(mail['Subject'].to_s) if reply_to.present? topic = topic_lookup[reply_to] || reply_to topic_lookup[msg_id] = topic - replies << {id: msg_id, topic: topic, file: filename} + replies << {id: msg_id, topic: topic, file: filename, title: title} else - topics << {id: msg_id, file: filename} + topics << {id: msg_id, file: filename, title: title} + topic_titles[title] ||= msg_id end end + # Replies whose :topic has no topic_lookup entry are hoisted to topics (NOTE(review): only reply ids are added as topic_lookup keys in the lines shown here; confirm this really detects a missing parent) + to_hoist = [] + replies.each do |r| + to_hoist << r if !topic_lookup[r[:topic]] + end + + to_hoist.each do |h| + replies.delete(h) + topics << {id: h[:id], file: h[:file], title: h[:title]} + topic_titles[h[:title]] ||= h[:id] + end + + # Topics whose title was first registered by another message become replies to that message + to_group = [] + topics.each do |t| + first = topic_titles[t[:title]] + to_group << t if first && first != t[:id] + end + + to_group.each do |t| + topics.delete(t) + replies << {id: t[:id], topic: topic_titles[t[:title]], file: t[:file], title: t[:title]} + end + File.write(USER_INDEX_PATH, {users: users}.to_json) File.write(TOPIC_INDEX_PATH, {topics: topics}.to_json) File.write(REPLY_INDEX_PATH, {replies: replies}.to_json) end + def clean_title(title) + 
title.gsub(/^Re: */i, '') + end + + def clean_raw(raw) + raw.gsub(/-- \nYou received this message because you are subscribed to the Google Groups "[^"]*" group.\nTo unsubscribe from this group and stop receiving emails from it, send an email to [^+@]+\+unsubscribe@googlegroups.com\.\nFor more options, visit https:\/\/groups\.google\.com\/groups\/opt_out\./, '') + end + def import_users puts "", "importing users" @@ -100,7 +135,7 @@ class ImportScripts::Mbox < ImportScripts::Base topics = all_topics[offset..offset+BATCH_SIZE-1] break if topics.nil? - next if all_records_exist? :posts, topics.map {|t| t['id'].to_i} + next if all_records_exist? :posts, topics.map {|t| t['id']} create_posts(topics, total: topic_count, offset: offset) do |t| raw_email = File.read(t['file']) @@ -116,11 +151,11 @@ class ImportScripts::Mbox < ImportScripts::Base title = mail.subject.gsub(/\[[^\]]+\]+/, '').strip { id: t['id'], - title: title, + title: clean_title(title), user_id: user_id_from_imported_user_id(mail.from.first) || Discourse::SYSTEM_USER_ID, created_at: mail.date, category: CATEGORY_ID, - raw: raw, + raw: clean_raw(raw), cook_method: Post.cook_methods[:email] } end end @@ -129,9 +164,6 @@ class ImportScripts::Mbox < ImportScripts::Base def import_replies puts "", "creating topic replies" - all_topics = ::JSON.parse(File.read(TOPIC_INDEX_PATH))['topics'] - topic_count = all_topics.size - replies = ::JSON.parse(File.read(REPLY_INDEX_PATH))['replies'] post_count = replies.size @@ -139,7 +171,7 @@ class ImportScripts::Mbox < ImportScripts::Base posts = replies[offset..offset+BATCH_SIZE-1] break if posts.nil? - next if all_records_exist? :posts, posts.map {|p| p['id'].to_i} + next if all_records_exist? 
:posts, posts.map {|p| p['id']} create_posts(posts, total: post_count, offset: offset) do |p| parent_id = p['topic'] @@ -161,7 +193,7 @@ class ImportScripts::Mbox < ImportScripts::Base topic_id: topic_id, user_id: user_id_from_imported_user_id(mail.from.first) || Discourse::SYSTEM_USER_ID, created_at: mail.date, - raw: raw, + raw: clean_raw(raw), cook_method: Post.cook_methods[:email] } end end