From d0095ce0415bd7cb77998dd37269752dc6988135 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9gis=20Hanol?= Date: Mon, 11 Aug 2014 18:44:17 +0200 Subject: [PATCH] FEATURE: migrate vanilla importer to new import architecture --- lib/tasks/vanilla.thor | 354 ------------------------------- script/import_scripts/base.rb | 31 ++- script/import_scripts/vanilla.rb | 233 +++++++++++++++++++- 3 files changed, 253 insertions(+), 365 deletions(-) delete mode 100644 lib/tasks/vanilla.thor diff --git a/lib/tasks/vanilla.thor b/lib/tasks/vanilla.thor deleted file mode 100644 index dba425077..000000000 --- a/lib/tasks/vanilla.thor +++ /dev/null @@ -1,354 +0,0 @@ -require "csv" - -class Vanilla < Thor - - desc "import", "Imports posts from a Vanilla export" - method_option :file, aliases: '-f', required: true, desc: "The vanilla file to import" - - def import - - unless File.exist?(options[:file]) - puts "File '#{options[:file]}' not found" - exit 1 - end - - load_rails - - file = read_file(options[:file]) - parse_file(file) - - disable_rate_limiter - - create_users - create_user_memberships - - create_categories - create_topics - create_posts - update_topic_statuses - - create_private_topics - create_private_posts - ensure - enable_rate_limiter - end - - no_commands do - - def load_rails - puts "loading rails..." - require "./config/environment" - end - - def disable_rate_limiter - puts "disabling rate limiter..." - RateLimiter.disable - end - - def read_file(file) - puts "reading file..." - string = File.read(file).gsub("\\N", "").gsub(/\\$\n/m, "\\n").gsub("\\,", ",").gsub(/(?/i, "\n```\n") - .gsub(/<\/?code\s*>/i, "`") - .gsub("<", "<") - .gsub(">", ">") - end - - def enable_rate_limiter - puts "enabling rate limiter..." - RateLimiter.enable - end - - end - -end diff --git a/script/import_scripts/base.rb b/script/import_scripts/base.rb index eb77dce45..1115b0ade 100644 --- a/script/import_scripts/base.rb +++ b/script/import_scripts/base.rb @@ -23,8 +23,8 @@ class ImportScripts::Base @failed_groups = [] @existing_users = {} @failed_users = [] - @categories = {} - @posts = {} + @categories_lookup = {} + @existing_posts = {} @topic_lookup = {} GroupCustomField.where(name: 'import_id').pluck(:group_id, :value).each do |group_id, import_id| @@ -36,11 +36,11 @@ class ImportScripts::Base end CategoryCustomField.where(name: 'import_id').pluck(:category_id, :value).each do |category_id, import_id| - @categories[import_id] = Category.find(category_id.to_i) + @categories_lookup[import_id] = Category.find(category_id.to_i) end PostCustomField.where(name: 'import_id').pluck(:post_id, :value).each do |post_id, import_id| - @posts[import_id] = post_id + @existing_posts[import_id] = post_id end Post.pluck(:id, :topic_id, :post_number).each do |post_id,t,n| @@ -81,7 +81,7 @@ class ImportScripts::Base # Get the Discourse Post id based on the id of the source record def post_id_from_imported_post_id(import_id) - @posts[import_id] || @posts[import_id.to_s] + @existing_posts[import_id] || @existing_posts[import_id.to_s] end # Get the Discourse topic info (a hash) based on the id of the source record @@ -110,7 +110,7 @@ class ImportScripts::Base # Get the Discourse Category id based on the id of the source category def category_from_imported_category_id(import_id) - @categories[import_id] || @categories[import_id.to_s] + @categories_lookup[import_id] || @categories_lookup[import_id.to_s] end def create_admin(opts={}) @@ -132,6 +132,8 @@ class ImportScripts::Base # group in the original datasource. The given id will not be used # to create the Discourse group record. def create_groups(results, opts={}) + puts "", "creating groups" + groups_created = 0 groups_skipped = 0 total = opts[:total] || results.size @@ -180,6 +182,8 @@ class ImportScripts::Base # user in the original datasource. The given id will not be used to # create the Discourse user record. def create_users(results, opts={}) + puts "", "creating users" + num_users_before = User.count users_created = 0 users_skipped = 0 @@ -189,6 +193,11 @@ class ImportScripts::Base results.each do |result| u = yield(result) + if u.nil? + users_skipped += 1 + next # block returns nil to skip a post + end + if user_id_from_imported_user_id(u[:id]) users_skipped += 1 elsif u[:email].present? @@ -271,7 +280,7 @@ class ImportScripts::Base end new_category = create_category(params, params[:id]) - @categories[params[:id]] = new_category + @categories_lookup[params[:id]] = new_category end end @@ -282,7 +291,7 @@ class ImportScripts::Base post_create_action = opts.delete(:post_create_action) new_category = Category.new( name: opts[:name], - user_id: -1, + user_id: opts[:user_id] || opts[:user].try(:id) || -1, position: opts[:position], description: opts[:description], parent_category_id: opts[:parent_category_id] @@ -299,6 +308,8 @@ class ImportScripts::Base # Topics should give attributes title and category. # Replies should provide topic_id. Use topic_lookup_from_imported_post_id to find the topic. def create_posts(results, opts={}) + puts "", "creating posts" + skipped = 0 created = 0 total = opts[:total] || results.size @@ -319,7 +330,7 @@ class ImportScripts::Base begin new_post = create_post(params, import_id) if new_post.is_a?(Post) - @posts[import_id] = new_post.id + @existing_posts[import_id] = new_post.id @topic_lookup[new_post.id] = {post_number: new_post.post_number, topic_id: new_post.topic_id} created += 1 @@ -330,7 +341,7 @@ class ImportScripts::Base end rescue => e skipped += 1 - puts "Error creating post #{import_id}. Skipping." + puts "Exception while creating post #{import_id}. Skipping." puts e.message rescue Discourse::InvalidAccess => e skipped += 1 diff --git a/script/import_scripts/vanilla.rb b/script/import_scripts/vanilla.rb index 78cd985bf..2bcbcc774 100644 --- a/script/import_scripts/vanilla.rb +++ b/script/import_scripts/vanilla.rb @@ -1 +1,232 @@ -# TODO: need to migrate it from https://github.com/discourse/discourse/blob/master/lib/tasks/vanilla.thor +require File.expand_path(File.dirname(__FILE__) + "/base.rb") +require "csv" + +class ImportScripts::Vanilla < ImportScripts::Base + + def initialize + super + + @vanilla_file = ARGV[0] + raise ArgumentError.new('Vanilla file argument missing. Provide full path to vanilla csv file.') if @vanilla_file.blank? + end + + def execute + check_file_exist + parse_file + + import_users + import_categories + + import_topics + import_posts + + import_private_topics + import_private_posts + end + + private + + def check_file_exist + raise ArgumentError.new("File does not exist: #{@vanilla_file}") unless File.exist?(@vanilla_file) + end + + def parse_file + puts "parsing file..." + file = read_file + + # TODO: parse header & validate version number + header = file.readline + + until file.eof? + line = file.readline + next if line.blank? + next if line.start_with?("//") + + if m = /^Table: (\w+)/.match(line) + # extract table name + table = m[1].underscore.pluralize + # read the data until an empty line + data = [] + # first line is the table definition, turn that into a proper csv header + data << file.readline.split(",").map { |c| c.split(":")[0].underscore }.join(",") + until (line = file.readline).blank? + data << line.strip + end + # PERF: don't parse useless tables + next if ["activities", "user_meta"].include? table + # parse the data + puts "parsing #{table}..." + parsed_data = CSV.parse(data.join("\n"), headers: true, header_converters: :symbol).map { |row| row.to_hash } + instance_variable_set("@#{table}".to_sym, parsed_data) + end + end + end + + def read_file + puts "reading file..." + string = File.read(@vanilla_file).gsub("\\N", "") + .gsub(/\\$\n/m, "\\n") + .gsub("\\,", ",") + .gsub(/(? 0 + puts "", "importing first-level categories..." + create_categories(first_level_categories) { |category| import_category(category) } + + # adds other categories + second_level_categories = @categories.select { |c| c[:parent_category_id] != "-1" } + if second_level_categories.count > 0 + puts "", "importing second-level categories..." + create_categories(second_level_categories) { |category| import_category(category) } + end + end + end + + def import_category(category) + c = { + id: category[:category_id], + name: category[:name], + user_id: user_id_from_imported_user_id(category[:insert_user_id]), + position: category[:sort].to_i, + created_at: parse_category_date(category[:date_inserted]), + description: clean_up(category[:description]), + } + if category[:parent_category_id] != "-1" + parent_category = category_from_imported_category_id(category[:parent_category_id].to_i) + c[:parent_category_id] = parent_category[:id] if parent_category + end + c + end + + def parse_category_date(date) + date == "0000-00-00 00:00:00" ? @root_category_created_at : parse_date(date) + end + + def import_topics + puts "", "importing topics..." + + create_posts(@discussions) do |discussion| + { + id: "discussion#" + discussion[:discussion_id], + user_id: user_id_from_imported_user_id(discussion[:insert_user_id]), + title: discussion[:name], + category_id: category_from_imported_category_id(discussion[:category_id]).try(:id), + raw: clean_up(discussion[:body]), + created_at: parse_date(discussion[:date_inserted]), + } + end + end + + def import_posts + puts "", "importing posts..." + + create_posts(@comments) do |comment| + next unless t = topic_lookup_from_imported_post_id("discussion#" + comment[:discussion_id]) + + { + id: "comment#" + comment[:comment_id], + user_id: user_id_from_imported_user_id(comment[:insert_user_id]), + topic_id: t[:topic_id], + raw: clean_up(comment[:body]), + created_at: parse_date(comment[:date_inserted]), + } + end + end + + def import_private_topics + puts "", "importing private topics..." + + create_posts(@conversations) do |conversation| + # list all other user ids in the conversation + user_ids_in_conversation = @user_conversations.select { |uc| uc[:conversation_id] == conversation[:conversation_id] && uc[:user_id] != conversation[:insert_user_id] } + .map { |uc| uc[:user_id] } + # retrieve their emails + user_emails_in_conversation = @users.select { |u| user_ids_in_conversation.include?(u[:user_id]) } + .map { |u| u[:email] } + # retrieve their usernames from the database + target_usernames = User.where("email IN (?)", user_emails_in_conversation).pluck(:username).to_a + + next if target_usernames.blank? + + user = find_user_by_import_id(conversation[:insert_user_id]) || Discourse.system_user + first_message = @conversation_messages.select { |cm| cm[:message_id] == conversation[:first_message_id] }.first + + { + archetype: Archetype.private_message, + id: "conversation#" + conversation[:conversation_id], + user_id: user.id, + title: "Private message from #{user.username}", + target_usernames: target_usernames, + raw: clean_up(first_message[:body]), + created_at: parse_date(conversation[:date_inserted]), + } + end + end + + def import_private_posts + puts "", "importing private posts..." + + first_message_ids = Set.new(@conversations.map { |c| c[:first_message_id] }.to_a) + @conversation_messages.reject! { |cm| first_message_ids.include?(cm[:message_id]) } + + create_posts(@conversation_messages) do |message| + next unless t = topic_lookup_from_imported_post_id("conversation#" + message[:conversation_id]) + + { + archetype: Archetype.private_message, + id: "message#" + message[:message_id], + user_id: user_id_from_imported_user_id(message[:insert_user_id]), + topic_id: t[:topic_id], + raw: clean_up(message[:body]), + created_at: parse_date(message[:date_inserted]), + } + end + end + + def parse_date(date) + DateTime.strptime(date, "%Y-%m-%d %H:%M:%S") + end + + def clean_up(raw) + (raw || "").gsub("\\n", "\n") + .gsub(/<\/?pre\s*>/i, "\n```\n") + .gsub(/<\/?code\s*>/i, "`") + .gsub("<", "<") + .gsub(">", ">") + end + +end + +ImportScripts::Vanilla.new.perform