FEATURE: migrate vanilla importer to new import architecture

This commit is contained in:
Régis Hanol 2014-08-11 18:44:17 +02:00
parent ec6ba68098
commit d0095ce041
3 changed files with 253 additions and 365 deletions

View file

@ -1,354 +0,0 @@
require "csv"
class Vanilla < Thor
desc "import", "Imports posts from a Vanilla export"
method_option :file, aliases: '-f', required: true, desc: "The vanilla file to import"
# Imports a Vanilla export file (given through the :file option) into Discourse.
#
# Exits with status 1 when the file does not exist. Otherwise loads Rails,
# parses the export and runs every creation step in order.
#
# The rate limiter is re-enabled only once it has actually been disabled, so an
# early exit (missing file, Rails failing to boot, unparsable export) no longer
# runs `enable_rate_limiter` before the application is loaded.
def import
  unless File.exist?(options[:file])
    puts "File '#{options[:file]}' not found"
    exit 1
  end

  load_rails

  file = read_file(options[:file])
  parse_file(file)

  disable_rate_limiter
  begin
    create_users
    create_user_memberships

    create_categories

    create_topics
    create_posts
    update_topic_statuses

    create_private_topics
    create_private_posts
  ensure
    # always restore rate limiting, even when one of the steps above fails
    enable_rate_limiter
  end
end
no_commands do
# Announces the step, then loads "./config/environment" (resolved relative to
# the current working directory).
def load_rails
  puts("loading rails...")
  require("./config/environment")
end
# Announces the step, then turns rate limiting off through RateLimiter.disable.
def disable_rate_limiter
  puts("disabling rate limiter...")
  RateLimiter.disable
end
# Reads the export at +file+ and rewrites its escaping, returning the result
# wrapped in a StringIO.
#
# The substitutions are applied in this exact order:
#   1. drop every \N marker
#   2. turn a backslash at the end of a line (plus the line break) into a literal \n
#   3. unescape \, into ,
#   4. turn \" (not preceded by a backslash) into a doubled quote
#   5. turn \\\" into \ followed by a doubled quote
def read_file(file)
  puts "reading file..."

  substitutions = [
    ["\\N", ""],
    [/\\$\n/m, "\\n"],
    ["\\,", ","],
    [/(?<!\\)\\"/, '""'],
    [/\\\\\\"/, '\\""']
  ]

  contents = File.read(file)
  substitutions.each do |pattern, replacement|
    contents = contents.gsub(pattern, replacement)
  end

  StringIO.new(contents)
end
# Parses a Vanilla export into one array of row hashes per table.
#
# +file+ is an IO-like object (the StringIO returned by read_file). After the
# first (header) line, the export is a sequence of sections:
#
#   Table: <Name>
#   <Column>:<type>,<Column>:<type>,...
#   <csv rows>
#   <blank line>
#
# Each table is stored in an instance variable named after the underscored,
# pluralized table name (e.g. "Table: User" => @users); rows are hashes keyed
# by the underscored column names as symbols.
#
# Lines are read with `gets`, so a table that runs up to the end of the file
# without a trailing blank line (or an empty file) no longer raises EOFError.
def parse_file(file)
  # TODO: parse header & validate version number
  file.gets # the header line is currently skipped

  until file.eof?
    line = file.readline

    next if line.blank?
    next if line.start_with?("//")

    m = /^Table: (\w+)/.match(line)
    next unless m

    # extract table name
    table = m[1]

    # first line is the table definition, turn that into a proper csv header
    definition = file.gets
    next if definition.nil?
    data = [definition.split(",").map { |c| c.split(":")[0].underscore }.join(",")]

    # read the data until an empty line or the end of the file
    while (row = file.gets) && !row.blank?
      data << row.strip
    end

    # parse the data
    puts "parsing #{table.underscore.pluralize}..."
    parsed_data = CSV.parse(data.join("\n"), headers: true, header_converters: :symbol).map { |r| r.to_hash }
    instance_variable_set("@#{table.underscore.pluralize}".to_sym, parsed_data)
  end
end
# Creates one Discourse user per exported user, skipping "[Deleted User]".
#
# The id of each created user is written back onto the exported row under
# :new_id. Rows rejected with ActiveRecord::RecordInvalid are skipped silently
# and therefore keep no :new_id.
def create_users
  puts "creating users..."

  created_count = 0

  @users.each do |record|
    next if record[:name] == "[Deleted User]"

    begin
      attributes = {
        name: record[:name],
        email: record[:email],
        username: UserNameSuggester.suggest(record[:name]),
        created_at: DateTime.strptime(record[:date_inserted], "%Y-%m-%d %H:%M:%S"),
        trust_level: TrustLevel.levels[:basic],
        bio_raw: clean_up(record[:discovery_text])
      }
      record[:new_id] = User.create!(attributes).id
      created_count += 1
    rescue ActiveRecord::RecordInvalid
      # email has already been taken...
    end
  end

  puts "created #{created_count} users!"
end
# Runs the two role-granting steps in order: administrators first, then
# moderators. Returns whatever the moderator step returns.
def create_user_memberships
  puts "creating user memberships..."
  [:add_administrators, :add_moderators].map { |grant| send(grant) }.last
end
# Grants admin rights to every imported user holding the "Administrator" role
# in the export.
#
# Users are matched by email. Role rows pointing at an unknown user, and users
# that do not exist in Discourse (for instance because they were skipped during
# user creation), are ignored instead of raising NoMethodError on nil. When the
# export has no "Administrator" role, nothing is granted.
def add_administrators
  puts "granting admin rights..."

  admin_role = @roles.find { |r| r[:name] == "Administrator" }
  if admin_role.nil?
    puts "0 admins!"
    return
  end

  # index emails once instead of rescanning @users for every role row
  emails_by_user_id = {}
  @users.each { |u| emails_by_user_id[u[:user_id]] ||= u[:email] }

  granted = 0
  @user_roles.each do |ur|
    next unless ur[:role_id] == admin_role[:role_id]

    email = emails_by_user_id[ur[:user_id]]
    next if email.nil?

    user = User.where(email: email).first
    next if user.nil?

    user.grant_admin!
    granted += 1
  end

  puts "#{granted} admins!"
end
# Grants moderation rights to every imported user holding the "Moderator" role
# in the export.
#
# Users are matched by email. Role rows pointing at an unknown user, and users
# that do not exist in Discourse (for instance because they were skipped during
# user creation), are ignored instead of raising NoMethodError on nil. When the
# export has no "Moderator" role, nothing is granted.
def add_moderators
  puts "granting moderation rights..."

  moderator_role = @roles.find { |r| r[:name] == "Moderator" }
  if moderator_role.nil?
    puts "0 moderators!"
    return
  end

  # index emails once instead of rescanning @users for every role row
  emails_by_user_id = {}
  @users.each { |u| emails_by_user_id[u[:user_id]] ||= u[:email] }

  granted = 0
  @user_roles.each do |ur|
    next unless ur[:role_id] == moderator_role[:role_id]

    email = emails_by_user_id[ur[:user_id]]
    next if email.nil?

    user = User.where(email: email).first
    next if user.nil?

    user.grant_moderation!
    granted += 1
  end

  puts "#{granted} moderators!"
end
# Creates the exported categories, two levels deep.
#
# The root category (id "-1") is remembered in @root_category, its creation
# date in @root_category_created_at, and it is then removed from @categories.
# Children of the root are created first; after that, the categories whose
# parent is one of those first-level categories. Each created row receives its
# new id under :new_id. Deeper categories are not created.
def create_categories
  puts "creating categories..."

  # save some information about the root category, then drop it
  @root_category = @categories.find { |c| c[:category_id] == "-1" }
  @root_category_created_at = DateTime.strptime(@root_category[:date_inserted], "%Y-%m-%d %H:%M:%S")
  @categories.reject! { |c| c[:category_id] == "-1" }

  # root's child categories
  top_level = @categories.select { |c| c[:parent_category_id] == "-1" }
  top_level_ids = Set.new
  top_level.each do |parent|
    top_level_ids << parent[:category_id].to_i
    parent[:new_id] = create_category(parent)
  end

  # categories sitting directly below a first-level category
  children = @categories.select { |c| top_level_ids.include?(c[:parent_category_id].to_i) }
  children.each do |child|
    parent = @categories.find { |c| c[:category_id] == child[:parent_category_id] }
    child[:new_id] = create_category(child, parent[:new_id])
  end

  puts "created #{top_level.size + children.size} categories!"
end
# Creates a single Discourse category from an exported row and returns the id
# of the new record.
#
# +new_parent_category_id+ is the Discourse id of the parent category, or nil
# for a top-level category. The owner is the imported user matching
# :insert_user_id, falling back to Discourse.system_user.
def create_category(category, new_parent_category_id=nil)
  owner = get_user_by_previous_id(category[:insert_user_id]) || Discourse.system_user

  attributes = {
    name: category[:name],
    color: "AB9364",
    text_color: "FFF",
    position: category[:sort].to_i,
    user: owner,
    created_at: parse_category_date(category[:date_inserted]),
    description: clean_up(category[:description]),
    parent_category_id: new_parent_category_id
  }

  Category.create!(attributes).id
end
# Parses a category's creation date. The all-zero placeholder date is replaced
# by the root category's creation date (@root_category_created_at).
def parse_category_date(date)
  return @root_category_created_at if date == "0000-00-00 00:00:00"

  DateTime.strptime(date, "%Y-%m-%d %H:%M:%S")
end
# Creates one topic per exported discussion.
#
# The parsed creation date is stored on the discussion row under :created_at
# and the id of the new topic under :new_id. The author is the imported user
# matching :insert_user_id, falling back to Discourse.system_user. The category
# name is only passed along when the discussion has a :category_id.
def create_topics
  puts "creating topics..."

  @discussions.each do |discussion|
    author = get_user_by_previous_id(discussion[:insert_user_id]) || Discourse.system_user
    discussion[:created_at] = DateTime.strptime(discussion[:date_inserted], "%Y-%m-%d %H:%M:%S")

    post_options = {
      title: discussion[:name],
      raw: clean_up(discussion[:body]),
      created_at: discussion[:created_at],
      skip_validations: true
    }
    if discussion[:category_id]
      post_options[:category] = get_category_by_previous_id(discussion[:category_id]).try(:name)
    end

    first_post = PostCreator.new(author, post_options).create
    discussion[:new_id] = first_post.topic.id
  end

  puts "created #{@discussions.size} topics!"
end
# Creates one reply per exported comment, inside the topic created for the
# comment's discussion.
#
# Comments whose discussion is unknown, or was never turned into a topic, are
# reported and skipped. Comments for which the post could not be created are
# reported and skipped too, rather than raising NoMethodError on nil; this
# mirrors the check done when creating private posts. The id of each created
# post is stored on the comment row under :new_id.
def create_posts
  puts "creating posts..."

  posts_created = 0

  # index the discussions once instead of rescanning them for every comment
  discussions_by_id = {}
  @discussions.each { |d| discussions_by_id[d[:discussion_id]] ||= d }

  @comments.each do |comment|
    discussion = discussions_by_id[comment[:discussion_id]]
    unless discussion && discussion[:new_id]
      puts "could not find discussion ##{comment[:discussion_id]}"
      next
    end

    user = get_user_by_previous_id(comment[:insert_user_id]) || Discourse.system_user
    post_options = {
      topic_id: discussion[:new_id],
      raw: clean_up(comment[:body]),
      created_at: DateTime.strptime(comment[:date_inserted], "%Y-%m-%d %H:%M:%S"),
      skip_validations: true
    }

    post = PostCreator.new(user, post_options).create
    unless post && post.errors.empty?
      puts "could not create post for comment ##{comment[:comment_id]}"
      next
    end

    comment[:new_id] = post.id
    posts_created += 1
  end

  puts "created #{posts_created} posts!"
end
# Copies per-discussion status from the export onto the topics created for them.
# For every discussion that has a :new_id (the topic id), one UPDATE sets the
# view count, the closed flag and the pin date, and recomputes last_posted_at
# and bumped_at from the newest post in that topic.
def update_topic_statuses
puts "updating topic statuses..."
@discussions.each do |discussion|
# discussions without a :new_id have no topic to update
next unless topic_id = discussion[:new_id]
# HACK: make sure both bumped_at and last_posted_at are properly set
sql = <<-SQL
UPDATE topics
SET views = :views,
closed = :closed,
pinned_at = :pinned_at,
last_posted_at = (SELECT MAX(created_at) FROM posts WHERE topic_id = :topic_id),
bumped_at = (SELECT MAX(created_at) FROM posts WHERE topic_id = :topic_id)
WHERE id = :topic_id
SQL
# The export stores flags as the strings "1"/"0". A discussion flagged as an
# announcement is pinned at its :created_at value (set by create_topics);
# any other discussion gets a NULL pinned_at.
Topic.exec_sql(sql,
views: discussion[:count_views].to_i,
closed: discussion[:closed] == "1",
pinned_at: discussion[:announce] == "1" ? discussion[:created_at] : nil,
topic_id: topic_id
)
end
end
# Creates one private-message topic per exported conversation, using the
# conversation's first message as the opening post.
#
# The recipients are the other participants of the conversation, resolved to
# Discourse usernames through their email addresses. Conversations without any
# resolvable recipient are skipped. The id of the new topic is stored on the
# conversation row under :new_id.
def create_private_topics
  puts "creating private topics..."

  created_count = 0

  @conversations.each do |conversation|
    # the first message of the conversation becomes the opening post
    first_message = @conversation_messages.find { |cm| cm[:message_id] == conversation[:first_message_id] }

    # ids of every participant except the author
    participant_ids = @user_conversations
      .select { |uc| uc[:conversation_id] == conversation[:conversation_id] && uc[:user_id] != conversation[:insert_user_id] }
      .map { |uc| uc[:user_id] }

    # their emails, then their usernames as stored in the database
    participant_emails = @users
      .select { |u| participant_ids.include?(u[:user_id]) }
      .map { |u| u[:email] }
    target_usernames = User.where("email in (?)", participant_emails).pluck(:username).to_a

    next if target_usernames.empty?

    author = get_user_by_previous_id(conversation[:insert_user_id]) || Discourse.system_user

    post_options = {
      archetype: Archetype::private_message,
      title: "Private message from #{author.username}",
      raw: clean_up(first_message[:body]),
      target_usernames: target_usernames.join(","),
      created_at: DateTime.strptime(conversation[:date_inserted], "%Y-%m-%d %H:%M:%S"),
      skip_validations: true
    }

    opening_post = PostCreator.new(author, post_options).create
    conversation[:new_id] = opening_post.topic.id
    created_count += 1
  end

  puts "created #{created_count} private topics!"
end
# Creates the replies of every private conversation.
#
# The first message of a conversation is skipped (it is looked up separately
# when the private topic is created), as are messages whose conversation has
# no :new_id. Messages whose post could not be created are skipped as well.
# The id of each created post is stored on the message row under :new_id.
def create_private_posts
  puts "creating private posts..."

  created_count = 0

  @conversation_messages.each do |message|
    conversation = @conversations.find { |c| c[:conversation_id] == message[:conversation_id] }
    next if conversation[:first_message_id] == message[:message_id]

    topic_id = conversation[:new_id]
    next unless topic_id

    author = get_user_by_previous_id(message[:insert_user_id]) || Discourse.system_user

    post_options = {
      topic_id: topic_id,
      raw: clean_up(message[:body]),
      created_at: DateTime.strptime(message[:date_inserted], "%Y-%m-%d %H:%M:%S"),
      skip_validations: true
    }

    reply = PostCreator.new(author, post_options).create
    next unless reply && reply.errors.empty?

    message[:new_id] = reply.id
    created_count += 1
  end

  puts "created #{created_count} private posts!"
end
# Returns the Discourse user created for the exported user with the given id,
# or nil when that user is unknown or has no :new_id.
def get_user_by_previous_id(previous_id)
  record = @users.find { |u| u[:user_id] == previous_id }
  return unless record && record[:new_id]

  User.find(record[:new_id])
end
# Returns the Discourse category created for the exported category with the
# given id, or nil when that category is unknown or has no :new_id.
def get_category_by_previous_id(previous_id)
  record = @categories.find { |c| c[:category_id] == previous_id }
  return unless record && record[:new_id]

  Category.find(record[:new_id])
end
# Converts exported markup into the raw text handed to Discourse.
#
# nil is treated as an empty string. In order: literal \n sequences become real
# line breaks, <pre> tags become fenced code markers, <code> tags become
# backticks, and the &lt; / &gt; entities are decoded.
def clean_up(raw)
  replacements = [
    ["\\n", "\n"],
    [/<\/?pre\s*>/i, "\n```\n"],
    [/<\/?code\s*>/i, "`"],
    ["&lt;", "<"],
    ["&gt;", ">"]
  ]

  replacements.inject(raw || "") do |text, (pattern, replacement)|
    text.gsub(pattern, replacement)
  end
end
# Announces the step, then turns rate limiting back on through RateLimiter.enable.
def enable_rate_limiter
  puts("enabling rate limiter...")
  RateLimiter.enable
end
end
end

View file

@ -23,8 +23,8 @@ class ImportScripts::Base
@failed_groups = [] @failed_groups = []
@existing_users = {} @existing_users = {}
@failed_users = [] @failed_users = []
@categories = {} @categories_lookup = {}
@posts = {} @existing_posts = {}
@topic_lookup = {} @topic_lookup = {}
GroupCustomField.where(name: 'import_id').pluck(:group_id, :value).each do |group_id, import_id| GroupCustomField.where(name: 'import_id').pluck(:group_id, :value).each do |group_id, import_id|
@ -36,11 +36,11 @@ class ImportScripts::Base
end end
CategoryCustomField.where(name: 'import_id').pluck(:category_id, :value).each do |category_id, import_id| CategoryCustomField.where(name: 'import_id').pluck(:category_id, :value).each do |category_id, import_id|
@categories[import_id] = Category.find(category_id.to_i) @categories_lookup[import_id] = Category.find(category_id.to_i)
end end
PostCustomField.where(name: 'import_id').pluck(:post_id, :value).each do |post_id, import_id| PostCustomField.where(name: 'import_id').pluck(:post_id, :value).each do |post_id, import_id|
@posts[import_id] = post_id @existing_posts[import_id] = post_id
end end
Post.pluck(:id, :topic_id, :post_number).each do |post_id,t,n| Post.pluck(:id, :topic_id, :post_number).each do |post_id,t,n|
@ -81,7 +81,7 @@ class ImportScripts::Base
# Get the Discourse Post id based on the id of the source record # Get the Discourse Post id based on the id of the source record
def post_id_from_imported_post_id(import_id) def post_id_from_imported_post_id(import_id)
@posts[import_id] || @posts[import_id.to_s] @existing_posts[import_id] || @existing_posts[import_id.to_s]
end end
# Get the Discourse topic info (a hash) based on the id of the source record # Get the Discourse topic info (a hash) based on the id of the source record
@ -110,7 +110,7 @@ class ImportScripts::Base
# Get the Discourse Category id based on the id of the source category # Get the Discourse Category id based on the id of the source category
def category_from_imported_category_id(import_id) def category_from_imported_category_id(import_id)
@categories[import_id] || @categories[import_id.to_s] @categories_lookup[import_id] || @categories_lookup[import_id.to_s]
end end
def create_admin(opts={}) def create_admin(opts={})
@ -132,6 +132,8 @@ class ImportScripts::Base
# group in the original datasource. The given id will not be used # group in the original datasource. The given id will not be used
# to create the Discourse group record. # to create the Discourse group record.
def create_groups(results, opts={}) def create_groups(results, opts={})
puts "", "creating groups"
groups_created = 0 groups_created = 0
groups_skipped = 0 groups_skipped = 0
total = opts[:total] || results.size total = opts[:total] || results.size
@ -180,6 +182,8 @@ class ImportScripts::Base
# user in the original datasource. The given id will not be used to # user in the original datasource. The given id will not be used to
# create the Discourse user record. # create the Discourse user record.
def create_users(results, opts={}) def create_users(results, opts={})
puts "", "creating users"
num_users_before = User.count num_users_before = User.count
users_created = 0 users_created = 0
users_skipped = 0 users_skipped = 0
@ -189,6 +193,11 @@ class ImportScripts::Base
results.each do |result| results.each do |result|
u = yield(result) u = yield(result)
if u.nil?
users_skipped += 1
next # block returns nil to skip a post
end
if user_id_from_imported_user_id(u[:id]) if user_id_from_imported_user_id(u[:id])
users_skipped += 1 users_skipped += 1
elsif u[:email].present? elsif u[:email].present?
@ -271,7 +280,7 @@ class ImportScripts::Base
end end
new_category = create_category(params, params[:id]) new_category = create_category(params, params[:id])
@categories[params[:id]] = new_category @categories_lookup[params[:id]] = new_category
end end
end end
@ -282,7 +291,7 @@ class ImportScripts::Base
post_create_action = opts.delete(:post_create_action) post_create_action = opts.delete(:post_create_action)
new_category = Category.new( new_category = Category.new(
name: opts[:name], name: opts[:name],
user_id: -1, user_id: opts[:user_id] || opts[:user].try(:id) || -1,
position: opts[:position], position: opts[:position],
description: opts[:description], description: opts[:description],
parent_category_id: opts[:parent_category_id] parent_category_id: opts[:parent_category_id]
@ -299,6 +308,8 @@ class ImportScripts::Base
# Topics should give attributes title and category. # Topics should give attributes title and category.
# Replies should provide topic_id. Use topic_lookup_from_imported_post_id to find the topic. # Replies should provide topic_id. Use topic_lookup_from_imported_post_id to find the topic.
def create_posts(results, opts={}) def create_posts(results, opts={})
puts "", "creating posts"
skipped = 0 skipped = 0
created = 0 created = 0
total = opts[:total] || results.size total = opts[:total] || results.size
@ -319,7 +330,7 @@ class ImportScripts::Base
begin begin
new_post = create_post(params, import_id) new_post = create_post(params, import_id)
if new_post.is_a?(Post) if new_post.is_a?(Post)
@posts[import_id] = new_post.id @existing_posts[import_id] = new_post.id
@topic_lookup[new_post.id] = {post_number: new_post.post_number, topic_id: new_post.topic_id} @topic_lookup[new_post.id] = {post_number: new_post.post_number, topic_id: new_post.topic_id}
created += 1 created += 1
@ -330,7 +341,7 @@ class ImportScripts::Base
end end
rescue => e rescue => e
skipped += 1 skipped += 1
puts "Error creating post #{import_id}. Skipping." puts "Exception while creating post #{import_id}. Skipping."
puts e.message puts e.message
rescue Discourse::InvalidAccess => e rescue Discourse::InvalidAccess => e
skipped += 1 skipped += 1

View file

@ -1 +1,232 @@
# TODO: need to migrate it from https://github.com/discourse/discourse/blob/master/lib/tasks/vanilla.thor require File.expand_path(File.dirname(__FILE__) + "/base.rb")
require "csv"
class ImportScripts::Vanilla < ImportScripts::Base
# Runs the parent initializer, then takes the path of the Vanilla export from
# the first command-line argument.
#
# Raises ArgumentError when no path was given.
def initialize
  super

  @vanilla_file = ARGV[0]
  return unless @vanilla_file.blank?

  raise ArgumentError, 'Vanilla file argument missing. Provide full path to vanilla csv file.'
end
# Runs the import steps in order: file check, parsing, users, categories,
# topics, posts, then private topics and private posts. Returns the result of
# the last step.
def execute
  steps = [
    :check_file_exist,
    :parse_file,
    :import_users,
    :import_categories,
    :import_topics,
    :import_posts,
    :import_private_topics,
    :import_private_posts
  ]

  steps.map { |step| send(step) }.last
end
private
# Raises ArgumentError when the export file (@vanilla_file) does not exist.
def check_file_exist
  return if File.exist?(@vanilla_file)

  raise ArgumentError, "File does not exist: #{@vanilla_file}"
end
# Parses the Vanilla export into one array of row hashes per table.
#
# After the first (header) line, the export is a sequence of sections:
#
#   Table: <Name>
#   <Column>:<type>,<Column>:<type>,...
#   <csv rows>
#   <blank line>
#
# Each table is stored in an instance variable named after the underscored,
# pluralized table name (e.g. "Table: User" => @users); rows are hashes keyed
# by the underscored column names as symbols. The "activities" and "user_meta"
# tables are read past but not parsed.
#
# Lines are read with `gets`, so a table that runs up to the end of the file
# without a trailing blank line (or an empty file) no longer raises EOFError.
def parse_file
  puts "parsing file..."
  file = read_file

  # TODO: parse header & validate version number
  file.gets # the header line is currently skipped

  until file.eof?
    line = file.readline

    next if line.blank?
    next if line.start_with?("//")

    m = /^Table: (\w+)/.match(line)
    next unless m

    # extract table name
    table = m[1].underscore.pluralize

    # first line is the table definition, turn that into a proper csv header
    definition = file.gets
    next if definition.nil?
    data = [definition.split(",").map { |c| c.split(":")[0].underscore }.join(",")]

    # read the data until an empty line or the end of the file
    while (row = file.gets) && !row.blank?
      data << row.strip
    end

    # PERF: don't parse useless tables
    next if ["activities", "user_meta"].include?(table)

    # parse the data
    puts "parsing #{table}..."
    parsed_data = CSV.parse(data.join("\n"), headers: true, header_converters: :symbol).map { |r| r.to_hash }
    instance_variable_set("@#{table}".to_sym, parsed_data)
  end
end
# Reads the export at @vanilla_file and rewrites its escaping, returning the
# result wrapped in a StringIO.
#
# The substitutions are applied in this exact order:
#   1. drop every \N marker
#   2. turn a backslash at the end of a line (plus the line break) into a literal \n
#   3. unescape \, into ,
#   4. turn \" (not preceded by a backslash) into a doubled quote
#   5. turn \\\" into \ followed by a doubled quote
def read_file
  puts "reading file..."

  contents = File.read(@vanilla_file)
  contents = contents.gsub("\\N", "")
  contents = contents.gsub(/\\$\n/m, "\\n")
  contents = contents.gsub("\\,", ",")
  contents = contents.gsub(/(?<!\\)\\"/, '""')
  contents = contents.gsub(/\\\\\\"/, '\\""')

  StringIO.new(contents)
end
# Hands every exported user to create_users, skipping "[Deleted User]".
#
# A user is flagged as admin / moderator when one of its rows in @user_roles
# points at the role named "Administrator" / "Moderator". When the export has
# no such role the flag is simply false, instead of raising NoMethodError on
# nil. Returns whatever create_users returns.
def import_users
  admin_role = @roles.find { |r| r[:name] == "Administrator" }
  moderator_role = @roles.find { |r| r[:name] == "Moderator" }
  admin_role_id = admin_role && admin_role[:role_id]
  moderator_role_id = moderator_role && moderator_role[:role_id]

  # group the role ids per user once instead of scanning @user_roles twice per user
  role_ids_by_user_id = Hash.new { |hash, user_id| hash[user_id] = [] }
  @user_roles.each { |ur| role_ids_by_user_id[ur[:user_id]] << ur[:role_id] }

  create_users(@users) do |user|
    next if user[:name] == "[Deleted User]"

    role_ids = role_ids_by_user_id.fetch(user[:user_id], [])

    {
      id: user[:user_id],
      email: user[:email],
      name: user[:name],
      created_at: parse_date(user[:date_inserted]),
      bio_raw: clean_up(user[:discovery_text]),
      moderator: !moderator_role_id.nil? && role_ids.include?(moderator_role_id),
      admin: !admin_role_id.nil? && role_ids.include?(admin_role_id),
    }
  end
end
# Imports the exported categories in two passes.
#
# The root category (id "-1") is remembered in @root_category, its parsed
# creation date in @root_category_created_at, and it is then removed from
# @categories. Children of the root are handed to create_categories first;
# every remaining category is handed over in a second call. Nothing is
# imported when the root has no children.
def import_categories
  # save some information about the root category, then drop it
  @root_category = @categories.find { |c| c[:category_id] == "-1" }
  @root_category_created_at = parse_date(@root_category[:date_inserted])
  @categories.reject! { |c| c[:category_id] == "-1" }

  # root's child categories
  first_level = @categories.select { |c| c[:parent_category_id] == "-1" }
  return if first_level.empty?

  puts "", "importing first-level categories..."
  create_categories(first_level) { |category| import_category(category) }

  # every other category
  second_level = @categories.reject { |c| c[:parent_category_id] == "-1" }
  return if second_level.empty?

  puts "", "importing second-level categories..."
  create_categories(second_level) { |category| import_category(category) }
end
# Builds the attribute hash create_categories expects for one exported category.
#
# For a category that does not sit directly under the root (parent id other
# than "-1"), the already imported parent is looked up and, when found, its id
# is added under :parent_category_id.
def import_category(category)
  attributes = {
    id: category[:category_id],
    name: category[:name],
    user_id: user_id_from_imported_user_id(category[:insert_user_id]),
    position: category[:sort].to_i,
    created_at: parse_category_date(category[:date_inserted]),
    description: clean_up(category[:description]),
  }

  parent_import_id = category[:parent_category_id]
  return attributes if parent_import_id == "-1"

  parent = category_from_imported_category_id(parent_import_id.to_i)
  attributes[:parent_category_id] = parent[:id] if parent
  attributes
end
# Parses a category's creation date. The all-zero placeholder date is replaced
# by the root category's creation date (@root_category_created_at).
def parse_category_date(date)
  return @root_category_created_at if date == "0000-00-00 00:00:00"

  parse_date(date)
end
# Hands every exported discussion to create_posts as a new topic.
#
# The import id is the discussion id prefixed with "discussion#". The author
# and category are resolved from the ids recorded during the earlier imports.
def import_topics
  puts "", "importing topics..."

  create_posts(@discussions) do |discussion|
    import_id = "discussion#" + discussion[:discussion_id]
    author_id = user_id_from_imported_user_id(discussion[:insert_user_id])
    category = category_from_imported_category_id(discussion[:category_id])

    {
      id: import_id,
      user_id: author_id,
      title: discussion[:name],
      category_id: category.try(:id),
      raw: clean_up(discussion[:body]),
      created_at: parse_date(discussion[:date_inserted]),
    }
  end
end
# Hands every exported comment to create_posts as a reply.
#
# The topic is found through the import id of the comment's discussion
# ("discussion#<id>"); comments whose discussion was not imported yield nil.
def import_posts
  puts "", "importing posts..."

  create_posts(@comments) do |comment|
    topic = topic_lookup_from_imported_post_id("discussion#" + comment[:discussion_id])
    next unless topic

    {
      id: "comment#" + comment[:comment_id],
      user_id: user_id_from_imported_user_id(comment[:insert_user_id]),
      topic_id: topic[:topic_id],
      raw: clean_up(comment[:body]),
      created_at: parse_date(comment[:date_inserted]),
    }
  end
end
# Hands every exported conversation to create_posts as a private-message topic,
# using the conversation's first message as the opening post.
#
# The recipients are the other participants of the conversation, resolved to
# Discourse usernames through their email addresses. A conversation yields nil
# when its first message is missing from the export (this used to raise
# NoMethodError on nil) or when no recipient can be resolved.
def import_private_topics
  puts "", "importing private topics..."

  # index the lookup tables once instead of rescanning them for every conversation
  messages_by_id = {}
  @conversation_messages.each { |cm| messages_by_id[cm[:message_id]] ||= cm }

  participant_ids_by_conversation = Hash.new { |hash, id| hash[id] = [] }
  @user_conversations.each { |uc| participant_ids_by_conversation[uc[:conversation_id]] << uc[:user_id] }

  emails_by_user_id = {}
  @users.each { |u| emails_by_user_id[u[:user_id]] ||= u[:email] }

  create_posts(@conversations) do |conversation|
    first_message = messages_by_id[conversation[:first_message_id]]
    next if first_message.nil?

    # list all other user ids in the conversation
    other_user_ids = participant_ids_by_conversation.fetch(conversation[:conversation_id], []) - [conversation[:insert_user_id]]
    # retrieve their emails
    emails = other_user_ids.map { |id| emails_by_user_id[id] }.compact.uniq
    next if emails.empty?

    # retrieve their usernames from the database
    target_usernames = User.where("email IN (?)", emails).pluck(:username).to_a
    next if target_usernames.empty?

    user = find_user_by_import_id(conversation[:insert_user_id]) || Discourse.system_user

    {
      archetype: Archetype.private_message,
      id: "conversation#" + conversation[:conversation_id],
      user_id: user.id,
      title: "Private message from #{user.username}",
      target_usernames: target_usernames,
      raw: clean_up(first_message[:body]),
      created_at: parse_date(conversation[:date_inserted]),
    }
  end
end
# Hands every remaining conversation message to create_posts as a reply in its
# private topic.
#
# Messages that are the first message of a conversation are removed from
# @conversation_messages beforehand. The topic is found through the import id
# "conversation#<id>"; messages without an imported topic yield nil.
def import_private_posts
  puts "", "importing private posts..."

  opening_message_ids = Set.new(@conversations.map { |c| c[:first_message_id] })
  @conversation_messages.reject! { |cm| opening_message_ids.include?(cm[:message_id]) }

  create_posts(@conversation_messages) do |message|
    topic = topic_lookup_from_imported_post_id("conversation#" + message[:conversation_id])
    next unless topic

    {
      archetype: Archetype.private_message,
      id: "message#" + message[:message_id],
      user_id: user_id_from_imported_user_id(message[:insert_user_id]),
      topic_id: topic[:topic_id],
      raw: clean_up(message[:body]),
      created_at: parse_date(message[:date_inserted]),
    }
  end
end
# Parses a "YYYY-MM-DD HH:MM:SS" timestamp from the export into a DateTime.
def parse_date(date)
  timestamp_format = "%Y-%m-%d %H:%M:%S"
  DateTime.strptime(date, timestamp_format)
end
# Converts exported markup into the raw text handed to Discourse.
#
# nil is treated as an empty string. In order: literal \n sequences become real
# line breaks, <pre> tags become fenced code markers, <code> tags become
# backticks, and the &lt; / &gt; entities are decoded.
def clean_up(raw)
  text = raw || ""
  text = text.gsub("\\n", "\n")
  text = text.gsub(/<\/?pre\s*>/i, "\n```\n")
  text = text.gsub(/<\/?code\s*>/i, "`")
  text = text.gsub("&lt;", "<")
  text.gsub("&gt;", ">")
end
end
# Script entry point: builds the importer (its initializer reads the export path
# from ARGV[0]) and starts it through `perform`, which is not defined in this
# file (presumably inherited from ImportScripts::Base and calling `execute` — TODO confirm).
ImportScripts::Vanilla.new.perform