discourse/script/import_scripts/base.rb
Gerhard Schlager 2dd01c61b0 Improves the base importer
- Move some methods into their own classes in order to make it easier
  to reuse them outside of classes extending the base importer. For
  compatibility reasons the old methods are still in the base importer
  and delegate to the new objects. The following methods and hashes were
  extracted:
    - all the lookup maps for existing and imported data
    - all the methods used for uploads and attachments
- No need to store failed users and groups. This information wasn't
  used anyway.
- Print progress instead of category names when importing categories.
- Allow importers to override whether bbcode_to_md should be used (until
  now it always depended on ARGV)
- Allow importers to add additional site settings that automatically get
  restored after the importer finishes.
- Show how many posts and messages are imported per minute. This should
  help detect when the import is slowing down and needs to be restarted.
- Use max_image_width and max_image_height from settings instead of
  hard-coded values for uploaded images.
2015-07-16 15:28:42 +02:00

if ARGV.include?('bbcode-to-md')
  # Replace (most) bbcode with markdown before creating posts.
  # This will dramatically clean up the final posts in Discourse.
  #
  # In a temp dir:
  #
  # git clone https://github.com/nlalonde/ruby-bbcode-to-md.git
  # cd ruby-bbcode-to-md
  # gem build ruby-bbcode-to-md.gemspec
  # gem install ruby-bbcode-to-md-*.gem
  require 'ruby-bbcode-to-md'
end

require_relative '../../config/environment'
require_relative 'base/lookup_container'
require_relative 'base/uploader'

module ImportScripts; end

class ImportScripts::Base
  include ActionView::Helpers::NumberHelper

  def initialize
    preload_i18n

    @lookup = ImportScripts::LookupContainer.new
    @uploader = ImportScripts::Uploader.new

    @bbcode_to_md = true if use_bbcode_to_md?

    @site_settings_during_import = {}
    @old_site_settings = {}
    @start_times = { import: Time.now }
  end

  def preload_i18n
    I18n.t("test")
    ActiveSupport::Inflector.transliterate("test")
  end

  def perform
    Rails.logger.level = 3 # :error, so that we don't create log files that are many GB

    change_site_settings
    execute

    puts ""

    update_bumped_at
    update_last_posted_at
    update_last_seen_at
    update_feature_topic_users
    update_category_featured_topics
    update_topic_count_replies
    reset_topic_counters

    elapsed = Time.now - @start_times[:import]
    puts '', '', 'Done (%02dh %02dmin %02dsec)' % [elapsed / 3600, elapsed / 60 % 60, elapsed % 60]

  ensure
    reset_site_settings
  end

  def get_site_settings_for_import
    {
      email_domains_blacklist: '',
      min_topic_title_length: 1,
      min_post_length: 1,
      min_first_post_length: 1,
      min_private_message_post_length: 1,
      min_private_message_title_length: 1,
      allow_duplicate_topic_titles: true,
      disable_emails: true,
      authorized_extensions: '*'
    }
  end
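
  # Importers that need extra settings during the import can override the
  # method above and merge their own values into the returned hash. A minimal
  # sketch (the subclass name and chosen settings are assumptions for
  # illustration, not part of this file):
  #
  #   class ImportScripts::MyForum < ImportScripts::Base
  #     def get_site_settings_for_import
  #       super.merge(max_image_width: 2000, max_image_height: 2000)
  #     end
  #   end
  #
  # Every key returned here is saved by change_site_settings and restored by
  # reset_site_settings once the import finishes.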

  def change_site_settings
    @site_settings_during_import = get_site_settings_for_import

    @site_settings_during_import.each do |key, value|
      @old_site_settings[key] = SiteSetting.send(key)
      SiteSetting.set(key, value)
    end

    RateLimiter.disable
  end

  def reset_site_settings
    @old_site_settings.each do |key, value|
      current_value = SiteSetting.send(key)
      # only restore the old value if the setting still has the value we set during the import
      SiteSetting.set(key, value) if current_value == @site_settings_during_import[key]
    end

    RateLimiter.enable
  end

  def use_bbcode_to_md?
    ARGV.include?("bbcode-to-md")
  end
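
  # A subclass can force bbcode conversion on (or off) regardless of ARGV by
  # overriding the predicate above. Hypothetical example, not part of this file:
  #
  #   def use_bbcode_to_md?
  #     true
  #   end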

  # Implementation will do most of its work in its execute method.
  # It will need to call create_users, create_categories, and create_posts.
  def execute
    raise NotImplementedError
  end
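
  # A minimal sketch of what a concrete importer's execute method tends to look
  # like. The mysql_query helper, table names, and column names below are
  # assumptions for illustration; real scripts read from whatever datasource
  # they are importing.
  #
  #   def execute
  #     users = mysql_query("SELECT id, email, username FROM forum_users")
  #     create_users(users) do |row|
  #       { id: row["id"], email: row["email"], username: row["username"] }
  #     end
  #
  #     posts = mysql_query("SELECT id, user_id, topic_title, body FROM forum_posts")
  #     create_posts(posts) do |row|
  #       {
  #         id: row["id"],
  #         user_id: user_id_from_imported_user_id(row["user_id"]) || -1,
  #         title: row["topic_title"],
  #         raw: row["body"]
  #       }
  #     end
  #   end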

  def post_id_from_imported_post_id(import_id)
    @lookup.post_id_from_imported_post_id(import_id)
  end

  def topic_lookup_from_imported_post_id(import_id)
    @lookup.topic_lookup_from_imported_post_id(import_id)
  end

  def group_id_from_imported_group_id(import_id)
    @lookup.group_id_from_imported_group_id(import_id)
  end

  def find_group_by_import_id(import_id)
    @lookup.find_group_by_import_id(import_id)
  end

  def user_id_from_imported_user_id(import_id)
    @lookup.user_id_from_imported_user_id(import_id)
  end

  def find_user_by_import_id(import_id)
    @lookup.find_user_by_import_id(import_id)
  end

  def category_id_from_imported_category_id(import_id)
    @lookup.category_id_from_imported_category_id(import_id)
  end

  def create_admin(opts={})
    admin = User.new
    admin.email = opts[:email] || "sam.saffron@gmail.com"
    admin.username = opts[:username] || "sam"
    admin.password = SecureRandom.uuid
    admin.save!
    admin.grant_admin!
    admin.change_trust_level!(TrustLevel[4])
    admin.email_tokens.update_all(confirmed: true)
    admin
  end

  # Iterate through a list of groups to be imported.
  # Takes a collection and yields to the block for each element.
  # Block should return a hash with the attributes for each element.
  # Required fields are :id and :name, where :id is the id of the
  # group in the original datasource. The given id will not be used
  # to create the Discourse group record.
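  #
  # A hedged usage sketch (the source_groups collection and its keys are
  # assumptions for illustration):
  #
  #   create_groups(source_groups) do |group|
  #     { id: group["groupid"], name: group["title"] }
  #   end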
  def create_groups(results, opts={})
    created = 0
    skipped = 0
    failed = 0
    total = opts[:total] || results.size

    results.each do |result|
      g = yield(result)

      if @lookup.group_id_from_imported_group_id(g[:id])
        skipped += 1
      else
        new_group = create_group(g, g[:id])

        if new_group.valid?
          @lookup.add_group(g[:id].to_s, new_group)
          created += 1
        else
          failed += 1
          puts "Failed to create group id #{g[:id]} #{new_group.name}: #{new_group.errors.full_messages}"
        end
      end

      print_status created + skipped + failed + (opts[:offset] || 0), total
    end

    [created, skipped]
  end

  def create_group(opts, import_id)
    opts = opts.dup.tap { |o| o.delete(:id) }
    import_name = opts[:name]
    opts[:name] = UserNameSuggester.suggest(import_name)

    existing = Group.where(name: opts[:name]).first
    return existing if existing && existing.custom_fields["import_id"].to_i == import_id.to_i

    g = existing || Group.new(opts)
    g.custom_fields["import_id"] = import_id
    g.custom_fields["import_name"] = import_name
    g.tap(&:save)
  end

  # Iterate through a list of user records to be imported.
  # Takes a collection, and yields to the block for each element.
  # Block should return a hash with the attributes for the User model.
  # Required fields are :id and :email, where :id is the id of the
  # user in the original datasource. The given id will not be used to
  # create the Discourse user record.
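  #
  # A hedged usage sketch (source_users and its keys are assumptions for
  # illustration; return nil from the block to skip a record):
  #
  #   create_users(source_users) do |user|
  #     {
  #       id: user["userid"],
  #       email: user["email"],
  #       username: user["login"],
  #       created_at: user["joined_at"]
  #     }
  #   end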
  def create_users(results, opts={})
    created = 0
    skipped = 0
    failed = 0
    total = opts[:total] || results.size

    results.each do |result|
      u = yield(result)

      # block returns nil to skip a user
      if u.nil?
        skipped += 1
      else
        import_id = u[:id]

        if @lookup.user_id_from_imported_user_id(import_id)
          skipped += 1
        elsif u[:email].present?
          new_user = create_user(u, import_id)

          if new_user.valid? && new_user.user_profile.valid?
            @lookup.add_user(import_id.to_s, new_user)
            created += 1
          else
            failed += 1
            puts "Failed to create user id: #{import_id}, username: #{new_user.username}, email: #{new_user.email}"
            puts "user errors: #{new_user.errors.full_messages}"
            puts "user_profile errors: #{new_user.user_profile.errors.full_messages}"
          end
        else
          failed += 1
          puts "Skipping user id #{import_id} because email is blank"
        end
      end

      print_status created + skipped + failed + (opts[:offset] || 0), total
    end

    [created, skipped]
  end

  def create_user(opts, import_id)
    opts.delete(:id)
    merge = opts.delete(:merge)
    post_create_action = opts.delete(:post_create_action)

    existing = User.where(email: opts[:email].downcase, username: opts[:username]).first
    return existing if existing && (merge || existing.custom_fields["import_id"].to_i == import_id.to_i)

    bio_raw = opts.delete(:bio_raw)
    website = opts.delete(:website)
    location = opts.delete(:location)
    avatar_url = opts.delete(:avatar_url)

    opts[:name] = User.suggest_name(opts[:email]) unless opts[:name]
    if opts[:username].blank? ||
        opts[:username].length < User.username_length.begin ||
        opts[:username].length > User.username_length.end ||
        opts[:username] =~ /[^A-Za-z0-9_]/ ||
        opts[:username][0] =~ /[^A-Za-z0-9]/ ||
        !User.username_available?(opts[:username])
      opts[:username] = UserNameSuggester.suggest(opts[:username] || opts[:name] || opts[:email])
    end
    opts[:email] = opts[:email].downcase
    opts[:trust_level] = TrustLevel[1] unless opts[:trust_level]
    opts[:active] = opts.fetch(:active, true)
    opts[:import_mode] = true
    opts[:last_emailed_at] = opts.fetch(:last_emailed_at, Time.now)

    u = User.new(opts)
    u.custom_fields["import_id"] = import_id
    u.custom_fields["import_username"] = opts[:username] if opts[:username].present?
    u.custom_fields["import_avatar_url"] = avatar_url if avatar_url.present?

    begin
      User.transaction do
        u.save!
        if bio_raw.present? || website.present? || location.present?
          u.user_profile.bio_raw = bio_raw if bio_raw.present?
          u.user_profile.website = website if website.present?
          u.user_profile.location = location if location.present?
          u.user_profile.save!
        end
      end
    rescue
      # try based on email
      existing = User.find_by(email: opts[:email].downcase)
      if existing
        existing.custom_fields["import_id"] = import_id
        existing.save!
        u = existing
      end
    end

    post_create_action.try(:call, u) if u.persisted?

    u # If there was an error creating the user, u.errors has the messages
  end

  # Iterates through a collection to create categories.
  # The block should return a hash with attributes for the new category.
  # Required fields are :id and :name, where :id is the id of the
  # category in the original datasource. The given id will not be used to
  # create the Discourse category record.
  # Optional attributes are position, description, and parent_category_id.
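  #
  # A hedged usage sketch (source_categories and its keys are assumptions for
  # illustration):
  #
  #   create_categories(source_categories) do |category|
  #     {
  #       id: category["forumid"],
  #       name: category["title"],
  #       description: category["description"]
  #     }
  #   end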
  def create_categories(results)
    created = 0
    skipped = 0
    total = results.size

    results.each do |c|
      params = yield(c)

      # block returns nil to skip
      if params.nil? || @lookup.category_id_from_imported_category_id(params[:id])
        skipped += 1
      else
        # Basic massaging on the category name
        params[:name] = "Blank" if params[:name].blank?
        params[:name].strip!
        params[:name] = params[:name][0..49]

        # make sure categories don't go more than 2 levels deep
        if params[:parent_category_id]
          top = Category.find_by_id(params[:parent_category_id])
          top = top.parent_category while top && !top.parent_category.nil?
          params[:parent_category_id] = top.id if top
        end

        new_category = create_category(params, params[:id])
        @lookup.add_category(params[:id], new_category)

        created += 1
      end

      print_status created + skipped, total
    end

    [created, skipped]
  end

  def create_category(opts, import_id)
    existing = Category.where("LOWER(name) = ?", opts[:name].downcase).first
    return existing if existing && existing.parent_category.try(:id) == opts[:parent_category_id]

    post_create_action = opts.delete(:post_create_action)

    new_category = Category.new(
      name: opts[:name],
      user_id: opts[:user_id] || opts[:user].try(:id) || -1,
      position: opts[:position],
      description: opts[:description],
      parent_category_id: opts[:parent_category_id],
      color: opts[:color] || "AB9364",
      text_color: opts[:text_color] || "FFF",
    )

    new_category.custom_fields["import_id"] = import_id if import_id
    new_category.save!

    post_create_action.try(:call, new_category)

    new_category
  end

  def created_post(post)
    # override if needed
  end

  # Iterates through a collection of posts to be imported.
  # It can create topics and replies.
  # Attributes will be passed to the PostCreator.
  # Topics should give attributes title and category.
  # Replies should provide topic_id. Use topic_lookup_from_imported_post_id to find the topic.
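  #
  # A hedged usage sketch for replies (source_posts and its keys are
  # assumptions for illustration):
  #
  #   create_posts(source_posts) do |post|
  #     topic = topic_lookup_from_imported_post_id(post["parent_id"])
  #     next if topic.nil? # skip replies whose topic was not imported
  #
  #     {
  #       id: post["postid"],
  #       user_id: user_id_from_imported_user_id(post["userid"]) || -1,
  #       topic_id: topic[:topic_id],
  #       raw: post["body"],
  #       created_at: post["created_at"]
  #     }
  #   end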
  def create_posts(results, opts={})
    skipped = 0
    created = 0
    total = opts[:total] || results.size
    start_time = get_start_time("posts-#{total}") # the post count should be unique enough to differentiate between posts and PMs

    results.each do |r|
      params = yield(r)

      # block returns nil to skip a post
      if params.nil?
        skipped += 1
      else
        import_id = params.delete(:id).to_s

        if @lookup.post_id_from_imported_post_id(import_id)
          skipped += 1 # already imported this post
        else
          begin
            new_post = create_post(params, import_id)
            if new_post.is_a?(Post)
              @lookup.add_post(import_id, new_post)
              @lookup.add_topic(new_post)
              created_post(new_post)
              created += 1
            else
              skipped += 1
              puts "Error creating post #{import_id}. Skipping."
              puts new_post.inspect
            end
          rescue Discourse::InvalidAccess => e
            skipped += 1
            puts "InvalidAccess creating post #{import_id}. Topic is closed? #{e.message}"
          rescue => e
            skipped += 1
            puts "Exception while creating post #{import_id}. Skipping."
            puts e.message
            puts e.backtrace.join("\n")
          end
        end
      end

      print_status(created + skipped + (opts[:offset] || 0), total, start_time)
    end

    [created, skipped]
  end

  def create_post(opts, import_id)
    user = User.find(opts[:user_id])
    post_create_action = opts.delete(:post_create_action)
    opts = opts.merge(skip_validations: true)
    opts[:import_mode] = true
    opts[:custom_fields] ||= {}
    opts[:custom_fields]['import_id'] = import_id

    if @bbcode_to_md
      opts[:raw] = opts[:raw].bbcode_to_md(false) rescue opts[:raw]
    end

    post_creator = PostCreator.new(user, opts)
    post = post_creator.create
    post_create_action.try(:call, post) if post
    post ? post : post_creator.errors.full_messages
  end

  def create_upload(user_id, path, source_filename)
    @uploader.create_upload(user_id, path, source_filename)
  end

  # Iterate through a list of bookmark records to be imported.
  # Takes a collection, and yields to the block for each element.
  # Block should return a hash with the attributes for the bookmark.
  # Required fields are :user_id and :post_id, where both ids are
  # the values in the original datasource.
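  #
  # A hedged usage sketch (source_bookmarks and its keys are assumptions for
  # illustration):
  #
  #   create_bookmarks(source_bookmarks) do |bookmark|
  #     { user_id: bookmark["userid"], post_id: bookmark["postid"] }
  #   end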
  def create_bookmarks(results, opts={})
    created = 0
    skipped = 0
    total = opts[:total] || results.size

    user = User.new
    post = Post.new

    results.each do |result|
      params = yield(result)

      # only the IDs are needed, so this should be enough
      if params.nil?
        skipped += 1
      else
        user.id = @lookup.user_id_from_imported_user_id(params[:user_id])
        post.id = @lookup.post_id_from_imported_post_id(params[:post_id])

        if user.id.nil? || post.id.nil?
          skipped += 1
          puts "Skipping bookmark for user id #{params[:user_id]} and post id #{params[:post_id]}"
        else
          begin
            PostAction.act(user, post, PostActionType.types[:bookmark])
            created += 1
          rescue PostAction::AlreadyActed
            skipped += 1
          end
        end
      end

      print_status created + skipped + (opts[:offset] || 0), total
    end

    [created, skipped]
  end

  def close_inactive_topics(opts={})
    num_days = opts[:days] || 30
    puts '', "Closing topics that have been inactive for more than #{num_days} days."

    query = Topic.where('last_posted_at < ?', num_days.days.ago).where(closed: false)
    total_count = query.count
    closed_count = 0

    query.find_each do |topic|
      topic.update_status('closed', true, Discourse.system_user)
      closed_count += 1
      print_status(closed_count, total_count)
    end
  end

  def update_bumped_at
    puts "", "updating bumped_at on topics"
    Post.exec_sql("update topics t set bumped_at = COALESCE((select max(created_at) from posts where topic_id = t.id and post_type != #{Post.types[:moderator_action]}), bumped_at)")
  end

  def update_last_posted_at
    puts "", "updating last posted at on users"

    sql = <<-SQL
      WITH lpa AS (
        SELECT user_id, MAX(posts.created_at) AS last_posted_at
        FROM posts
        GROUP BY user_id
      )
      UPDATE users
      SET last_posted_at = lpa.last_posted_at
      FROM users u1
      JOIN lpa ON lpa.user_id = u1.id
      WHERE u1.id = users.id
        AND users.last_posted_at <> lpa.last_posted_at
    SQL

    User.exec_sql(sql)
  end

  # scripts that are able to import last_seen_at from the source data should override this method
  def update_last_seen_at
    puts "", "updating last seen at on users"

    User.exec_sql("UPDATE users SET last_seen_at = created_at WHERE last_seen_at IS NULL")
    User.exec_sql("UPDATE users SET last_seen_at = last_posted_at WHERE last_posted_at IS NOT NULL")
  end

  def update_feature_topic_users
    puts "", "updating featured topic users"

    total_count = Topic.count
    progress_count = 0

    Topic.find_each do |topic|
      topic.feature_topic_users
      progress_count += 1
      print_status(progress_count, total_count)
    end
  end

  def reset_topic_counters
    puts "", "resetting topic counters"

    total_count = Topic.count
    progress_count = 0

    Topic.find_each do |topic|
      Topic.reset_highest(topic.id)
      progress_count += 1
      print_status(progress_count, total_count)
    end
  end

  def update_category_featured_topics
    puts "", "updating featured topics in categories"

    total_count = Category.count
    progress_count = 0

    Category.find_each do |category|
      CategoryFeaturedTopic.feature_topics_for(category)
      progress_count += 1
      print_status(progress_count, total_count)
    end
  end

  def update_topic_count_replies
    puts "", "updating user topic reply counts"

    total_count = User.real.count
    progress_count = 0

    User.real.find_each do |u|
      u.user_stat.update_topic_reply_count
      u.user_stat.save!
      progress_count += 1
      print_status(progress_count, total_count)
    end
  end

  def update_tl0
    puts "", "setting users with no posts to trust level 0"

    total_count = User.count
    progress_count = 0

    User.find_each do |user|
      begin
        user.change_trust_level!(0) if Post.where(user_id: user.id).count == 0
      rescue Discourse::InvalidAccess
        nil
      end
      progress_count += 1
      print_status(progress_count, total_count)
    end
  end

  def html_for_upload(upload, display_filename)
    @uploader.html_for_upload(upload, display_filename)
  end

  def embedded_image_html(upload)
    @uploader.embedded_image_html(upload)
  end

  def attachment_html(upload, display_filename)
    @uploader.attachment_html(upload, display_filename)
  end

  def print_status(current, max, start_time = nil)
    if start_time.present?
      elapsed_seconds = Time.now - start_time
      elements_per_minute = '[%.0f items/min] ' % [current / elapsed_seconds.to_f * 60]
    else
      elements_per_minute = ''
    end

    print "\r%9d / %d (%5.1f%%) %s" % [current, max, current / max.to_f * 100, elements_per_minute]
  end

  def print_spinner
    @spinner_chars ||= %w{ | / - \\ }
    @spinner_chars.push @spinner_chars.shift
    print "\b#{@spinner_chars[0]}"
  end

  def get_start_time(key)
    @start_times.fetch(key) { |k| @start_times[k] = Time.now }
  end
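
  # Yields successive offsets (0, batch_size, 2 * batch_size, ...) forever; the
  # caller is expected to break out of the loop once a batch comes back empty.
  # A hedged usage sketch (the mysql_query helper, batch size, and query are
  # assumptions for illustration):
  #
  #   batches(1000) do |offset|
  #     rows = mysql_query("SELECT id, body FROM forum_posts LIMIT 1000 OFFSET #{offset}").to_a
  #     break if rows.empty?
  #     # create_posts(rows) { |row| ... }
  #   end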
  def batches(batch_size)
    offset = 0

    loop do
      yield offset
      offset += batch_size
    end
  end
end