Improvements to importing a mailing list

This commit is contained in:
Robin Ward 2015-10-22 15:02:53 -04:00
parent 0ea54e9255
commit 3b35972d25

View file

@ -37,6 +37,7 @@ class ImportScripts::Mbox < ImportScripts::Base
topics = [] topics = []
topic_lookup = {} topic_lookup = {}
topic_titles = {}
replies = [] replies = []
all_messages do |mail, filename| all_messages do |mail, filename|
@ -44,21 +45,55 @@ class ImportScripts::Mbox < ImportScripts::Base
msg_id = mail['Message-ID'].to_s msg_id = mail['Message-ID'].to_s
reply_to = mail['In-Reply-To'].to_s reply_to = mail['In-Reply-To'].to_s
title = clean_title(mail['Subject'].to_s)
if reply_to.present? if reply_to.present?
topic = topic_lookup[reply_to] || reply_to topic = topic_lookup[reply_to] || reply_to
topic_lookup[msg_id] = topic topic_lookup[msg_id] = topic
replies << {id: msg_id, topic: topic, file: filename} replies << {id: msg_id, topic: topic, file: filename, title: title}
else else
topics << {id: msg_id, file: filename} topics << {id: msg_id, file: filename, title: title}
topic_titles[title] ||= msg_id
end end
end end
# Replies without parents should be hoisted to topics
to_hoist = []
replies.each do |r|
to_hoist << r if !topic_lookup[r[:topic]]
end
to_hoist.each do |h|
replies.delete(h)
topics << {id: h[:id], file: h[:file], title: h[:title]}
topic_titles[h[:title]] ||= h[:id]
end
# Topics with duplicate titles should be replies to the first topic with that title
to_group = []
topics.each do |t|
first = topic_titles[t[:title]]
to_group << t if first && first != t[:id]
end
to_group.each do |t|
topics.delete(t)
replies << {id: t[:id], topic: topic_titles[t[:title]], file: t[:file], title: t[:title]}
end
File.write(USER_INDEX_PATH, {users: users}.to_json) File.write(USER_INDEX_PATH, {users: users}.to_json)
File.write(TOPIC_INDEX_PATH, {topics: topics}.to_json) File.write(TOPIC_INDEX_PATH, {topics: topics}.to_json)
File.write(REPLY_INDEX_PATH, {replies: replies}.to_json) File.write(REPLY_INDEX_PATH, {replies: replies}.to_json)
end end
# Normalizes a message subject so that a topic and its replies share one title.
#
# Removes every leading "Re:" marker (case-insensitive, with any number of
# trailing spaces), so "Re: RE: Hello" and "Hello" both become "Hello".
# The match is anchored to the start of the string with \A; a "Re:" that only
# begins a later line of the subject is left alone.
#
# title - the subject String to clean.
#
# Returns a new String; the argument is not modified.
def clean_title(title)
  title.sub(/\A(?:Re: *)+/i, '')
end
# Removes the Google Groups subscription footer from a message body.
#
# The footer is the block that starts with the "-- " signature delimiter,
# followed by the "You received this message because you are subscribed to
# the Google Groups ..." notice, the unsubscribe address line and the
# opt_out link line.
#
# Literal dots in the footer are escaped so they match only a real ".",
# and each line break is matched as \r?\n so both LF and CRLF footers are
# stripped.
#
# raw - the message text String.
#
# Returns a new String with every footer occurrence removed.
def clean_raw(raw)
  footer = %r{-- \r?\nYou received this message because you are subscribed to the Google Groups "[^"]*" group\.\r?\nTo unsubscribe from this group and stop receiving emails from it, send an email to [^+@]+\+unsubscribe@googlegroups\.com\.\r?\nFor more options, visit https://groups\.google\.com/groups/opt_out\.}
  raw.gsub(footer, '')
end
def import_users def import_users
puts "", "importing users" puts "", "importing users"
@ -100,7 +135,7 @@ class ImportScripts::Mbox < ImportScripts::Base
topics = all_topics[offset..offset+BATCH_SIZE-1] topics = all_topics[offset..offset+BATCH_SIZE-1]
break if topics.nil? break if topics.nil?
next if all_records_exist? :posts, topics.map {|t| t['id'].to_i} next if all_records_exist? :posts, topics.map {|t| t['id']}
create_posts(topics, total: topic_count, offset: offset) do |t| create_posts(topics, total: topic_count, offset: offset) do |t|
raw_email = File.read(t['file']) raw_email = File.read(t['file'])
@ -116,11 +151,11 @@ class ImportScripts::Mbox < ImportScripts::Base
title = mail.subject.gsub(/\[[^\]]+\]+/, '').strip title = mail.subject.gsub(/\[[^\]]+\]+/, '').strip
{ id: t['id'], { id: t['id'],
title: title, title: clean_title(title),
user_id: user_id_from_imported_user_id(mail.from.first) || Discourse::SYSTEM_USER_ID, user_id: user_id_from_imported_user_id(mail.from.first) || Discourse::SYSTEM_USER_ID,
created_at: mail.date, created_at: mail.date,
category: CATEGORY_ID, category: CATEGORY_ID,
raw: raw, raw: clean_raw(raw),
cook_method: Post.cook_methods[:email] } cook_method: Post.cook_methods[:email] }
end end
end end
@ -129,9 +164,6 @@ class ImportScripts::Mbox < ImportScripts::Base
def import_replies def import_replies
puts "", "creating topic replies" puts "", "creating topic replies"
all_topics = ::JSON.parse(File.read(TOPIC_INDEX_PATH))['topics']
topic_count = all_topics.size
replies = ::JSON.parse(File.read(REPLY_INDEX_PATH))['replies'] replies = ::JSON.parse(File.read(REPLY_INDEX_PATH))['replies']
post_count = replies.size post_count = replies.size
@ -139,7 +171,7 @@ class ImportScripts::Mbox < ImportScripts::Base
posts = replies[offset..offset+BATCH_SIZE-1] posts = replies[offset..offset+BATCH_SIZE-1]
break if posts.nil? break if posts.nil?
next if all_records_exist? :posts, posts.map {|p| p['id'].to_i} next if all_records_exist? :posts, posts.map {|p| p['id']}
create_posts(posts, total: post_count, offset: offset) do |p| create_posts(posts, total: post_count, offset: offset) do |p|
parent_id = p['topic'] parent_id = p['topic']
@ -161,7 +193,7 @@ class ImportScripts::Mbox < ImportScripts::Base
topic_id: topic_id, topic_id: topic_id,
user_id: user_id_from_imported_user_id(mail.from.first) || Discourse::SYSTEM_USER_ID, user_id: user_id_from_imported_user_id(mail.from.first) || Discourse::SYSTEM_USER_ID,
created_at: mail.date, created_at: mail.date,
raw: raw, raw: clean_raw(raw),
cook_method: Post.cook_methods[:email] } cook_method: Post.cook_methods[:email] }
end end
end end