2016-06-14 11:44:35 -04:00
require 'sqlite3'
2015-07-23 18:37:40 -04:00
require File . expand_path ( File . dirname ( __FILE__ ) + " /base.rb " )
class ImportScripts :: Mbox < ImportScripts :: Base
# CHANGE THESE BEFORE RUNNING THE IMPORTER

# Number of rows sliced off per batch in the import_* / create_* methods.
BATCH_SIZE = 1000

# Directory holding the mbox files. The SQLite index (index.db) is created
# inside this directory as well.
MBOX_DIR = File.expand_path("~/import/site").freeze

# Pattern marking the first line of each message inside a combined file.
# Set to nil to not split individual files
SPLIT_AT = /^From (.*) at/

# Maps a folder name under MBOX_DIR to the category its messages go into.
# Will create a category if it doesn't exist
CATEGORY_MAPPINGS = {
  "default" => "uncategorized",
  # ex: "jobs-folder" => "jobs"
}.freeze
2015-07-23 18:37:40 -04:00
# Runs every stage of the import, strictly in the order listed: categories,
# the SQLite email/user indices, index clean-up, then users, topics and replies.
def execute
  %i[
    import_categories
    create_email_indices
    create_user_indices
    massage_indices
    import_users
    create_forum_topics
    import_replies
  ].each { |stage| send(stage) }
end
2016-06-28 16:35:19 -04:00
# Hands every mapped category name except 'uncategorized' to
# create_categories, using the name as both the import id and the name.
def import_categories
  category_names = CATEGORY_MAPPINGS.values.reject { |name| name == 'uncategorized' }

  create_categories(category_names) { |name| { id: name, name: name } }
end
2016-06-14 11:44:35 -04:00
# Opens the SQLite index database stored as index.db inside MBOX_DIR.
# Callers in this file close the returned handle themselves.
def open_db
  index_path = "#{MBOX_DIR}/index.db"
  SQLite3::Database.new(index_path)
end
2015-07-23 18:37:40 -04:00
2016-06-20 12:41:57 -04:00
# Yields every line of the file at path +f+, transparently decompressing it
# when the name ends in ".gz".
#
# The file is opened with the block form of File.open, so it is closed even
# when the caller's block raises, and a failure to open the file surfaces as
# the original error (e.g. Errno::ENOENT) instead of a NoMethodError from the
# cleanup code.
def each_line(f)
  File.open(f, 'r') do |infile|
    if f.end_with?('.gz')
      gz = Zlib::GzipReader.new(infile)
      begin
        gz.each_line { |line| yield line }
      ensure
        # finish releases the reader without closing infile; File.open does that.
        gz.finish
      end
    else
      infile.each_line { |line| yield line }
    end
  end
end
2016-06-14 11:44:35 -04:00
# Yields every message found under MBOX_DIR as (mail, file_path).
#
# Files are taken from MBOX_DIR/messages/* followed by MBOX_DIR/<key>/* for
# each key of CATEGORY_MAPPINGS. When SPLIT_AT is set, each file is read line
# by line and a new message starts at every line matching SPLIT_AT; otherwise
# each file is treated as a single message. print_status is called after
# every yielded message.
#
# SPLIT_AT may be nil or removed entirely; both disable splitting.
def all_messages
  directories = ["messages"] + CATEGORY_MAPPINGS.keys
  files = directories.flat_map { |dir| Dir["#{MBOX_DIR}/#{dir}/*"] }

  # defined? guards against the constant having been deleted rather than nil'ed.
  split_at = defined?(SPLIT_AT) ? SPLIT_AT : nil

  # Parses one raw message, hands it to the caller's block and reports progress.
  deliver = lambda do |raw, file, idx|
    yield Mail.read_from_string(raw), file
    print_status(idx, files.size)
  end

  files.each_with_index do |f, idx|
    if split_at.present?
      msg = ""
      each_line(f) do |line|
        line = line.scrub
        # A separator line closes the message collected so far (if any).
        if line =~ split_at && !msg.empty?
          deliver.call(msg, f, idx)
          msg = ""
        end
        msg << line
      end

      # Flush the last message of the file.
      deliver.call(msg, f, idx) unless msg.empty?
    else
      deliver.call(File.read(f), f, idx)
    end
  end
end
2016-06-14 11:44:35 -04:00
# Normalises the reply_to column of the emails table so that every row is
# either a topic (reply_to NULL) or a reply pointing at an indexed message:
#
# 1. empty reply_to values become NULL;
# 2. reply_to is cleared when the referenced message is not in the index;
# 3. for every title used more than once, all messages except the earliest
#    one (by email_date) are pointed at that earliest message.
def massage_indices
  db = open_db
  db.execute "UPDATE emails SET reply_to = null WHERE reply_to = ''"

  rows = db.execute "SELECT msg_id, title, reply_to FROM emails ORDER BY datetime(email_date) ASC"

  msg_ids = {}
  titles = {}
  rows.each do |row|
    msg_ids[row[0]] = true
    # Rows arrive oldest first, so only the first msg_id seen per title is kept.
    titles[row[1]] ||= row[0]
  end

  # First, any replies where the parent doesn't exist should have that field cleared
  not_found = rows.select { |row| row[2].present? && !msg_ids.key?(row[2]) }.map { |row| row[0] }

  puts "#{not_found.size} records couldn't be associated with parents"
  # Message-IDs come straight from the emails, so they are bound as parameters
  # rather than interpolated. Slices stay below SQLite's default limit on
  # bound variables.
  not_found.each_slice(500) do |slice|
    placeholders = (["?"] * slice.size).join(",")
    db.execute "UPDATE emails SET reply_to = NULL WHERE msg_id IN (#{placeholders})", slice
  end

  dupe_titles = db.execute "SELECT title, COUNT(*) FROM emails GROUP BY title HAVING count(*) > 1"
  puts "#{dupe_titles.size} replies to wire up"
  dupe_titles.each do |t|
    title = t[0]
    first = titles[title]
    db.execute "UPDATE emails SET reply_to = ? WHERE title = ? and msg_id <> ?", [first, title, first]
  end
ensure
  # db is nil when open_db itself failed; don't mask that error.
  db.close if db
end
2015-10-22 15:02:53 -04:00
2016-06-20 14:58:53 -04:00
# Returns [from_email, from_name] for a parsed mail.
#
# from_email is the first From address with " at " turned back into "@" and
# any trailing " (...)" part removed; it is nil when the mail has no From.
# from_name is, in order of preference: the first display name, the text in
# parentheses inside the From header, or the whole From header as a string.
def extract_name(mail)
  from = mail[:from]

  from_email = nil
  if mail.from.present?
    from_email = mail.from.dup
    from_email = from_email.first.dup if from_email.kind_of?(Array)
    from_email.gsub!(/ at /, '@')
    from_email.gsub!(/ \(.*$/, '')
  end

  display_names = from.try(:display_names)
  from_name = display_names.present? ? display_names.first : nil

  # Fall back to a "(Real Name)" style comment in the header.
  from_name = from.to_s[/\(([^\)]+)\)/, 1] if from_name.blank?

  # Last resort: the header itself.
  from_name = from.to_s if from_name.blank?

  [from_email, from_name]
end
2016-06-14 11:44:35 -04:00
# Rebuilds the emails table of the SQLite index from scratch and fills it with
# one row per message yielded by all_messages. Rows that violate a constraint
# (e.g. a Message-ID that is already present) are skipped via INSERT OR IGNORE.
def create_email_indices
  db = open_db
  db.execute "DROP TABLE IF EXISTS emails"
  db.execute <<-SQL
    CREATE TABLE emails (
      msg_id VARCHAR(995) PRIMARY KEY,
      from_email VARCHAR(255) NOT NULL,
      from_name VARCHAR(255) NOT NULL,
      title VARCHAR(255) NOT NULL,
      reply_to VARCHAR(955) NULL,
      email_date DATETIME NOT NULL,
      message TEXT NOT NULL,
      category VARCHAR(255) NOT NULL
    );
  SQL

  db.execute "CREATE INDEX by_title ON emails (title)"
  db.execute "CREATE INDEX by_email ON emails (from_email)"

  puts "", "creating indices"

  insert_sql = "INSERT OR IGNORE INTO emails (msg_id,
                                              from_email,
                                              from_name,
                                              title,
                                              reply_to,
                                              email_date,
                                              message,
                                              category)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)"

  all_messages do |mail, filename|
    # The first path component below MBOX_DIR selects the category.
    relative_path = filename.sub("#{MBOX_DIR}/", '')
    directory = relative_path.split("/")[0]
    category = CATEGORY_MAPPINGS[directory] || CATEGORY_MAPPINGS['default'] || 'uncategorized'

    # Many ways to get a name
    from_email, from_name = extract_name(mail)

    sent_at = mail['date'].to_s
    sent_at = DateTime.parse(sent_at).to_s if sent_at.present?

    row = [
      mail['Message-ID'].to_s,
      from_email,
      from_name,
      clean_title(mail['Subject'].to_s),
      mail['In-Reply-To'].to_s,
      sent_at,
      mail.to_s,
      category
    ]
    db.execute insert_sql, row
  end
ensure
  db.close
end
2015-10-27 14:21:29 -04:00
2016-06-14 11:44:35 -04:00
# Rebuilds the users table of the SQLite index: one row per distinct
# from_email found in the emails table (later duplicates are ignored).
def create_user_indices
  db = open_db

  create_table_sql = <<-SQL
    CREATE TABLE users (
      email VARCHAR(995) PRIMARY KEY,
      name VARCHAR(255) NOT NULL
    );
  SQL
  populate_sql = "INSERT OR IGNORE INTO users (email, name) SELECT from_email, from_name FROM emails"

  db.execute "DROP TABLE IF EXISTS users"
  db.execute create_table_sql
  db.execute populate_sql
ensure
  db.close
end
2015-10-22 15:02:53 -04:00
# Normalises an email subject into a topic title.
#
# Removes every "[list-name]" tag, then repeatedly strips reply/forward
# prefixes from the start of the subject until none is left. Handled prefixes
# (case-insensitive): "Re:", "R:" and "RIF:" (Italian replies), "Fwd:" and
# "I:" (Italian forward). Mixed localized prefixes can be stacked when the
# mail client did not strip them, hence the loop.
#
# title - the subject String, or nil (treated as "").
#
# Returns the cleaned, whitespace-trimmed String.
def clean_title(title)
  # Strip mailing list name from subject
  title = (title || "").gsub(/\[[^\]]+\]+/, '').strip

  loop do
    # \A anchors to the start of the subject only, never to an inner line.
    shortened = title.sub(/\A(?:Re|R|RIF|Fwd|I): */i, '').strip
    break if shortened.length == title.length

    title = shortened
  end

  title
end
2016-06-20 14:58:53 -04:00
# Returns a copy of +input+ with the standard Google Groups footer removed
# (the "You received this message because you are subscribed to the Google
# Groups ... opt_out." signature block). +input+ itself is not modified.
#
# The footer is matched with either LF or CRLF line endings, and with or
# without the trailing space of the "-- " signature delimiter. Literal dots
# are escaped so only the real footer text matches.
def clean_raw(input)
  footer = %r{-- ?\r?\nYou received this message because you are subscribed to the Google Groups "[^"]*" group\.\r?\nTo unsubscribe from this group and stop receiving emails from it, send an email to [^+@]+\+unsubscribe@googlegroups\.com\.\r?\nFor more options, visit https://groups\.google\.com/groups/opt_out\.}

  input.gsub(footer, '')
end
2015-07-23 18:37:40 -04:00
# Creates users from the users table of the SQLite index, BATCH_SIZE rows at
# a time. The email address serves as both the import id and the email.
# A batch is skipped when all_records_exist? reports it as already imported.
def import_users
  puts "", "importing users"
  db = open_db

  all_users = db.execute("SELECT name, email FROM users")
  total_count = all_users.size

  batches(BATCH_SIZE) do |offset|
    # Slicing past the end of the array gives nil, which ends the loop.
    users = all_users[offset, BATCH_SIZE]
    break if users.nil?

    emails = users.map { |user| user[1] }
    next if all_records_exist?(:users, emails)

    create_users(users, total: total_count, offset: offset) do |user|
      name = user[0]
      email = user[1]
      { id: email, email: email, name: name }
    end
  end
ensure
  db.close
end
# Extracts the body Email::Receiver selects from the raw message +msg+ and
# returns it encoded as UTF-8.
#
# Mirrors the handling in create_forum_topics / import_replies: an Array
# result is joined into one string, and nil is returned when no body was
# selected (instead of failing with NoMethodError).
def parse_email(msg)
  selected = Email::Receiver.new(msg).select_body
  return unless selected

  selected = selected.join('') if selected.kind_of?(Array)
  selected.force_encoding(selected.encoding).encode("UTF-8")
end
# Creates one topic per indexed email that has no reply_to, BATCH_SIZE rows
# at a time. For each row the stored raw message is re-parsed, the body chosen
# by Email::Receiver becomes the post, and every attachment is uploaded and
# appended to the post as markdown. Rows without a selectable body are skipped.
def create_forum_topics
  puts "", "creating forum topics"
  db = open_db

  all_topics = db.execute("SELECT msg_id,
                                  from_email,
                                  from_name,
                                  title,
                                  email_date,
                                  message,
                                  category
                           FROM emails
                           WHERE reply_to IS NULL")
  topic_count = all_topics.size

  batches(BATCH_SIZE) do |offset|
    # Slicing past the end of the array gives nil, which ends the loop.
    topics = all_topics[offset, BATCH_SIZE]
    break if topics.nil?

    next if all_records_exist?(:posts, topics.map { |row| row[0] })

    create_posts(topics, total: topic_count, offset: offset) do |row|
      msg_id, _, _, _, _, raw_email, category = row

      receiver = Email::Receiver.new(raw_email)
      mail = Mail.read_from_string(raw_email)
      mail.body

      from_email, _ = extract_name(mail)

      selected = receiver.select_body
      next unless selected
      selected = selected.join('') if selected.kind_of?(Array)

      raw = selected.force_encoding(selected.encoding).encode("UTF-8")
      title = mail.subject

      # Unknown senders fall back to the system user.
      author_id = user_id_from_imported_user_id(from_email) || Discourse::SYSTEM_USER_ID

      # import the attachments
      mail.attachments.each do |attachment|
        tmp = Tempfile.new("discourse-email-attachment")
        begin
          # read attachment
          File.open(tmp.path, "w+b") { |f| f.write attachment.body.decoded }
          # create the upload for the user
          upload = Upload.create_for(author_id, tmp, attachment.filename, tmp.size)
          raw << "\n\n#{receiver.attachment_markdown(upload)}\n\n" if upload && upload.errors.empty?
        ensure
          tmp.try(:close!) rescue nil
        end
      end

      {
        id: msg_id,
        title: clean_title(title),
        user_id: author_id,
        created_at: mail.date,
        category: category,
        raw: clean_raw(raw),
        cook_method: Post.cook_methods[:email]
      }
    end
  end
ensure
  db.close
end
# Creates one reply post per indexed email that has a reply_to, BATCH_SIZE
# rows at a time. A reply is skipped when the topic of its parent message
# cannot be looked up or when no body can be selected from the raw message.
# Attachments are uploaded and appended to the post as markdown.
def import_replies
  puts "", "creating topic replies"
  db = open_db

  replies = db.execute("SELECT msg_id,
                               from_email,
                               from_name,
                               title,
                               email_date,
                               message,
                               reply_to
                        FROM emails
                        WHERE reply_to IS NOT NULL")
  post_count = replies.size

  batches(BATCH_SIZE) do |offset|
    # Slicing past the end of the array gives nil, which ends the loop.
    posts = replies[offset, BATCH_SIZE]
    break if posts.nil?

    next if all_records_exist?(:posts, posts.map { |row| row[0] })

    create_posts(posts, total: post_count, offset: offset) do |row|
      id, _, _, _, _, raw_email, parent_id = row

      topic = topic_lookup_from_imported_post_id(parent_id)
      topic_id = topic && topic[:topic_id]
      next unless topic_id

      receiver = Email::Receiver.new(raw_email)
      mail = Mail.read_from_string(raw_email)
      mail.body

      from_email, _ = extract_name(mail)

      selected = receiver.select_body
      selected = selected.join('') if selected.kind_of?(Array)
      next unless selected

      raw = selected.force_encoding(selected.encoding).encode("UTF-8")

      # Unknown senders fall back to the system user.
      author_id = user_id_from_imported_user_id(from_email) || Discourse::SYSTEM_USER_ID

      # import the attachments
      mail.attachments.each do |attachment|
        tmp = Tempfile.new("discourse-email-attachment")
        begin
          # read attachment
          File.open(tmp.path, "w+b") { |f| f.write attachment.body.decoded }
          # create the upload for the user
          upload = Upload.create_for(author_id, tmp, attachment.filename, tmp.size)
          raw << "\n\n#{receiver.attachment_markdown(upload)}\n\n" if upload && upload.errors.empty?
        ensure
          tmp.try(:close!) rescue nil
        end
      end

      {
        id: id,
        topic_id: topic_id,
        user_id: author_id,
        created_at: mail.date,
        raw: clean_raw(raw),
        cook_method: Post.cook_methods[:email]
      }
    end
  end
ensure
  db.close
end
end
# Script entry point: build the importer and start it. `perform` is not
# defined in this file — presumably inherited from ImportScripts::Base and
# expected to call #execute; verify against base.rb.
ImportScripts :: Mbox . new . perform