From 36e550ffeae313d3d9df0edd2ada91850dadf461 Mon Sep 17 00:00:00 2001 From: Erick Guan Date: Thu, 29 Jan 2015 15:06:30 +0800 Subject: [PATCH] add Discuz! X importer --- script/import_scripts/discuz_x.rb | 625 ++++++++++++++++++++++++++++++ 1 file changed, 625 insertions(+) create mode 100644 script/import_scripts/discuz_x.rb diff --git a/script/import_scripts/discuz_x.rb b/script/import_scripts/discuz_x.rb new file mode 100644 index 000000000..171cb96d0 --- /dev/null +++ b/script/import_scripts/discuz_x.rb @@ -0,0 +1,625 @@ +# encoding: utf-8 +# +# Author: Erick Guan +# +# This script import the data from latest Discuz! X +# Should work among Discuz! X3.x +# This script is tested only on Simplified Chinese Discuz! X instances +# If you want to import data other than Simplified Chinese, email me. + +require File.expand_path(File.dirname(__FILE__) + "/base.rb") + +require 'mysql2' + +class ImportScripts::DiscuzX < ImportScripts::Base + + DISCUZX_DB = "ultrax" + DB_TABLE_PREFIX = 'pre_' + BATCH_SIZE = 1000 + ORIGINAL_SITE_PREFIX = "oldsite.example.com/forums" # without http(s):// + NEW_SITE_PREFIX = "http://discourse.example.com" # with http:// or https:// + + # Set DISCUZX_BASE_DIR to the base directory of your discuz installation. + DISCUZX_BASE_DIR = '/var/www/discuz/upload' + AVATAR_DIR = '/uc_server/data/avatar' + ATTACHMENT_DIR = '/data/attachment/forum' + AUTHORIZED_EXTENSIONS = ['jpg', 'jpeg', 'png', 'gif', 'zip', 'rar', 'pdf'] + + def initialize + super + + @client = Mysql2::Client.new( + host: "localhost", + username: "root", + #password: "password", + database: DISCUZX_DB + ) + @first_post_id_by_topic_id = {} + end + + def execute + import_users + import_categories + import_posts + import_private_messages + import_attachments + end + + # add the prefix to the table name + def table_name(name = nil) + DB_TABLE_PREFIX + name + end + + # find which group members can be granted as admin + def get_knowledge_about_group + group_table = table_name 'common_usergroup' + result = mysql_query( + "SELECT groupid group_id, radminid role_id, type, grouptitle title + FROM #{group_table};") + @moderator_group_id = -1 + @admin_group_id = -1 + + result.each do |group| + role_id = group['role_id'] + group_id = group['group_id'] + case group['title'].strip + when '管理员' + @admin_admin_id = role_id + when '超级版主' + @moderator_admin_id = role_id + end + end + end + + def import_users + puts '', "creating users" + + get_knowledge_about_group + + sensitive_user_table = table_name 'ucenter_members' + user_table = table_name 'common_member' + profile_table = table_name 'common_member_profile' + status_table = table_name 'common_member_status' + total_count = mysql_query("SELECT count(*) count FROM #{user_table};").first['count'] + + batches(BATCH_SIZE) do |offset| + results = mysql_query( + "SELECT u.uid id, u.username username, u.email email, u.adminid admin_id, su.regdate regdate, s.regip regip, + u.emailstatus email_confirmed, u.avatarstatus avatar_exists, p.site website, p.resideprovince province, + p.residecity city, p.residedist country, p.residecommunity community, p.residesuite apartment, + p.bio bio, s.lastip last_visit_ip, s.lastvisit last_visit_time, s.lastpost last_posted_at, + s.lastsendmail last_emailed_at + FROM #{user_table} u + JOIN #{sensitive_user_table} su ON su.uid = u.uid + JOIN #{profile_table} p ON p.uid = u.uid + JOIN #{status_table} s ON s.uid = u.uid + ORDER BY u.uid ASC + LIMIT #{BATCH_SIZE} + OFFSET #{offset};") + + break if results.size < 1 + + create_users(results, total: total_count, offset: offset) do |user| + { id: user['id'], + email: user['email'], + username: user['username'], + name: user['username'], + created_at: Time.zone.at(user['regdate']), + registration_ip_address: user['regip'], + ip_address: user['last_visit_ip'], + last_seen_at: user['last_visit_time'], + last_emailed_at: user['last_emailed_at'], + last_posted_at: user['last_posted_at'], + moderator: user['admin_id'] == @moderator_admin_id, + admin: user['admin_id'] == @admin_admin_id, + active: true, + website: user['website'], + bio_raw: user['bio'], + location: "#{user['province']}#{user['city']}#{user['country']}#{user['community']}#{user['apartment']}", + post_create_action: lambda do |newmember| + if user['avatar_exists'] == 1 and newmember.uploaded_avatar_id.blank? + path, filename = discuzx_avatar_fullpath(user['id']) + if path + begin + upload = create_upload(newmember.id, path, filename) + if upload.persisted? + newmember.import_mode = false + newmember.create_user_avatar + newmember.import_mode = true + newmember.user_avatar.update(custom_upload_id: upload.id) + newmember.update(uploaded_avatar_id: upload.id) + else + puts "Error: Upload did not persist!" + end + rescue SystemCallError => err + puts "Could not import avatar: #{err.message}" + end + end + end + + # we don't send email to the unconfirmed user + newmember.update(email_digests: user['email_confirmed'] == 1) if newmember.email_digests + end + } + end + end + end + + def import_categories + puts '', "creating categories" + + forums_table = table_name 'forum_forum' + forums_data_table = table_name 'forum_forumfield' + + results = mysql_query(" + SELECT f.fid id, f.fup parent_id, f.name, f.type type, f.status status, f.displayorder position, + d.description description + FROM #{forums_table} f + JOIN #{forums_data_table} d ON f.fid = d.fid + ORDER BY parent_id ASC, id ASC + ") + + max_position = Category.all.max_by(&:position).position + create_categories(results) do |row| + next if row['type'] == 'group' || row['status'].to_i == 3 + + Category.all.max_by(&:position).position + h = { + id: row['id'], + name: row['name'], + description: row['description'], + position: row['position'].to_i + max_position + } + if row['parent_id'].to_i > 0 + parent = category_from_imported_category_id(row['parent_id']) + h[:parent_category_id] = parent.id if parent + end + h + end + end + + def import_posts + puts "", "creating topics and posts" + + posts_table = table_name 'forum_post' + topics_table = table_name 'forum_thread' + + total_count = mysql_query("SELECT count(*) count FROM #{posts_table}").first['count'] + + batches(BATCH_SIZE) do |offset| + results = mysql_query(" + SELECT p.pid id, + p.tid topic_id, + t.fid category_id, + t.subject title, + p.authorid user_id, + p.message raw, + p.dateline post_time, + p.first is_first_post, + p.invisible status + FROM #{posts_table} p, + #{topics_table} t + WHERE p.tid = t.tid + ORDER BY id ASC, topic_id ASC + LIMIT #{BATCH_SIZE} + OFFSET #{offset}; + ") + + break if results.size < 1 + + create_posts(results, total: total_count, offset: offset) do |m| + skip = false + mapped = {} + + mapped[:id] = m['id'] + mapped[:user_id] = user_id_from_imported_user_id(m['user_id']) || -1 + mapped[:raw] = process_discuzx_post(m['raw'], m['id']) + mapped[:created_at] = Time.zone.at(m['post_time']) + + if m['is_first_post'] == 1 + mapped[:category] = category_from_imported_category_id(m['category_id']).try(:name) + mapped[:title] = CGI.unescapeHTML(m['title']) + @first_post_id_by_topic_id[m['topic_id']] = m['id'] + else + parent = topic_lookup_from_imported_post_id(@first_post_id_by_topic_id[m['topic_id']]) + + if parent + mapped[:topic_id] = parent[:topic_id] + post_id = post_id_from_imported_post_id(find_post_id_by_quote_number(m['raw']).to_i) + if (post = Post.find_by(id: post_id)) + mapped[:reply_to_post_number] = post.post_number + end + else + puts "Parent topic #{m['topic_id']} doesn't exist. Skipping #{m['id']}: #{m['title'][0..40]}" + skip = true + end + end + + if [-5, -3, -1].include? m['status'] || mapped[:raw].blank? + mapped[:post_create_action] = lambda do |post| + PostDestroyer.new(Discourse.system_user, post).perform_delete + end + elsif m['status'] == -2# waiting for approve + mapped[:post_create_action] = lambda do |post| + PostAction.act(Discourse.system_user, post, 6, {take_action: false}) + end + end + + skip ? nil : mapped + end + end + end + + def import_private_messages + puts '', 'creating private messages' + + pm_indexes = table_name 'ucenter_pm_indexes' + pm_messages = table_name 'ucenter_pm_messages' + total_count = mysql_query("SELECT count(*) count FROM #{pm_indexes}").first['count'] + + batches(BATCH_SIZE) do |offset| + results = mysql_query(" + SELECT pmid id, plid thread_id, authorid user_id, message, dateline created_at + FROM #{pm_messages}_1 + UNION SELECT pmid id, plid thread_id, authorid user_id, message, dateline created_at + FROM #{pm_messages}_2 + UNION SELECT pmid id, plid thread_id, authorid user_id, message, dateline created_at + FROM #{pm_messages}_3 + UNION SELECT pmid id, plid thread_id, authorid user_id, message, dateline created_at + FROM #{pm_messages}_4 + UNION SELECT pmid id, plid thread_id, authorid user_id, message, dateline created_at + FROM #{pm_messages}_5 + UNION SELECT pmid id, plid thread_id, authorid user_id, message, dateline created_at + FROM #{pm_messages}_6 + UNION SELECT pmid id, plid thread_id, authorid user_id, message, dateline created_at + FROM #{pm_messages}_7 + UNION SELECT pmid id, plid thread_id, authorid user_id, message, dateline created_at + FROM #{pm_messages}_8 + UNION SELECT pmid id, plid thread_id, authorid user_id, message, dateline created_at + FROM #{pm_messages}_9 + ORDER BY thread_id ASC, id ASC + LIMIT #{BATCH_SIZE} + OFFSET #{offset};") + + break if results.size < 1 + + create_posts(results, total: total_count, offset: offset) do |m| + skip = false + mapped = {} + + mapped[:id] = "pm:#{m['id']}" + mapped[:user_id] = user_id_from_imported_user_id(m['user_id']) || -1 + mapped[:raw] = process_discuzx_post(m['message'], m['id']) + mapped[:created_at] = Time.zone.at(m['created_at']) + thread_id = "pm_#{m['thread_id']}" + + if is_first_pm(m['id'], m['thread_id']) + # find the title from list table + pm_thread = mysql_query(" + SELECT plid thread_id, subject + FROM #{table_name 'ucenter_pm_lists'} + WHERE plid = #{m['thread_id']};").first + mapped[:title] = pm_thread['subject'] + mapped[:archetype] = Archetype.private_message + + # Find the users who are part of this private message. + import_user_ids = mysql_query(" + SELECT plid thread_id, uid user_id + FROM #{table_name 'ucenter_pm_members'} + WHERE plid = #{m['thread_id']}; + ").map {|r| r['user_id']}.uniq + + mapped[:target_usernames] = import_user_ids.map! do |import_user_id| + import_user_id.to_s == m['user_id'].to_s ? nil : User.find_by(id: user_id_from_imported_user_id(import_user_id)).try(:username) + end.compact + + if mapped[:target_usernames].empty? # pm with yourself? + skip = true + puts "Skipping pm:#{m['id']} due to no target" + else + @first_post_id_by_topic_id[thread_id] = mapped[:id] + end + else + parent = topic_lookup_from_imported_post_id(@first_post_id_by_topic_id[thread_id]) + if parent + mapped[:topic_id] = parent[:topic_id] + else + puts "Parent post pm thread:#{thread_id} doesn't exist. Skipping #{m["id"]}: #{m["message"][0..40]}" + skip = true + end + end + + skip ? nil : mapped + end + + end + end + + # search for first pm id for the series of pm + def is_first_pm(pm_id, thread_id) + result = mysql_query(" + SELECT pmid id + FROM #{table_name 'ucenter_pm_indexes'} + WHERE plid = #{thread_id} + ORDER BY id") + result.first['id'].to_s == pm_id.to_s + end + + def process_discuzx_post(raw, import_id) + inline_image_regex = /\[img\]([\s\S]*?)\[\/img\]/ + s = raw.dup + + s.gsub!(inline_image_regex) do |d| + matches = inline_image_regex.match(d) + data = matches[1] + + upload, filename = upload_inline_image data + upload ? html_for_upload(upload, filename) : nil + end + + # Strip the quote + # [quote] quotation includes the topic which is the same as reply to in Discourse + # We get the pid to find the post number the post reply to. So it can be stripped + s = s.gsub(/\[quote\][\s\S]*?\[\/quote\]/i, '').strip + + # Remove the font tag + # Discourse doesn't support the font tag + s.gsub!(/\[font=[^ \t\r\n\f\]]*?\]/i, '') + s.gsub!(/\[\/font\]/i, '') + + # Remove the size tag + # I really have no idea what is this + s.gsub!(/\[size=[^ \t\r\n\f\]]*?\]/i, '') + s.gsub!(/\[\/size\]/i, '') + + # Remove the color tag + s.gsub!(/\[color=[^ \t\r\n\f\]]*?\]/i, '') + s.gsub!(/\[\/color\]/i, '') + + # Remove the align tag + # still don't know what it is + s.gsub!(/\[align=[^ \t\r\n\f\]]*?\]/i, '') + s.gsub!(/\[\/align\]/i, "\n") + + # Convert code + s.gsub!(/\[code\]/i, "\n```\n") + s.gsub!(/\[\/code\]/i, "\n```\n") + + # The edit notice should be removed + # example: 本帖最后由 Helloworld 于 2015-1-28 22:05 编辑 + s.gsub!(/\[i=s\] 本帖最后由[\s\S]*?编辑 \[\/i\]/, '') + + # Convert the custom smileys to emojis + # `{:cry:}` to `:cry` + s.gsub!(/\{(\:\S*?\:)\}/, '\1') + + # Replace internal forum links that aren't in the format + # convert list tags to ul and list=1 tags to ol + # (basically, we're only missing list=a here...) + s.gsub!(/\[list\](.*?)\[\/list:u\]/m, '[ul]\1[/ul]') + s.gsub!(/\[list=1\](.*?)\[\/list:o\]/m, '[ol]\1[/ol]') + # convert *-tags to li-tags so bbcode-to-md can do its magic on phpBB's lists: + s.gsub!(/\[\*\](.*?)\[\/\*:m\]/, '[li]\1[/li]') + + # Discuz can create PM out of a post, which will generates like + # [url=http://example.com/forum.php?mod=redirect&goto=findpost&pid=111&ptid=11][b]关于您在“主题名称”的帖子[/b][/url] + s.gsub!(pm_url_regexp) do |discuzx_link| + replace_internal_link(discuzx_link, $1) + end + + # [url][b]text[/b][/url] to **[url]text[/url]** + s.gsub!(/(\[url=[^\[\]]*?\])\[b\](\S*)\[\/b\](\[\/url\])/, '**\1\2\3**') + + s.gsub!(internal_url_regexp) do |discuzx_link| + replace_internal_link(discuzx_link, $1) + end + + # @someone without the url + s.gsub!(/@\[url=[^\[\]]*?\](\S*)\[\/url\]/i, '@\1') + + s.strip + end + + def replace_internal_link(discuzx_link, import_topic_id) + results = mysql_query("SELECT pid + FROM #{table_name 'forum_post'} + WHERE tid = #{import_topic_id} + ORDER BY pid ASC + LIMIT 1") + + return discuzx_link unless results.size > 0 + + linked_topic_id = results.first['pid'] + lookup = topic_lookup_from_imported_post_id(linked_topic_id) + + return discuzx_link unless lookup + + if (t = Topic.find_by(id: lookup[:topic_id])) + "#{NEW_SITE_PREFIX}/t/#{t.slug}/#{t.id}" + else + discuzx_link + end + end + + def internal_url_regexp + @internal_url_regexp ||= Regexp.new("http(?:s)?://#{ORIGINAL_SITE_PREFIX.gsub('.', '\.')}/forum\\.php\\?mod=viewthread&tid=(\\d+)(?:[^\\]\\[]*)") + end + + def pm_url_regexp + @pm_url_regexp ||= Regexp.new("http(?:s)?://#{ORIGINAL_SITE_PREFIX.gsub('.', '\.')}/forum\\.php\\?mod=redirect&goto=findpost&pid=\\d+&ptid=(\\d+)") + end + + # This step is done separately because it can take multiple attempts to get right (because of + # missing files, wrong paths, authorized extensions, etc.). + def import_attachments + setting = AUTHORIZED_EXTENSIONS.join('|') + SiteSetting.authorized_extensions = setting if setting != SiteSetting.authorized_extensions + + attachment_regex = /\[attach\](\d+)\[\/attach\]/ + + user = Discourse.system_user + + current_count = 0 + total_count = mysql_query("SELECT count(*) count FROM #{table_name 'forum_post'};").first['count'] + + success_count = 0 + fail_count = 0 + + puts '', "Importing attachments...", '' + + Post.find_each do |post| + current_count += 1 + print_status current_count, total_count + + new_raw = post.raw.dup + new_raw.gsub!(attachment_regex) do |s| + matches = attachment_regex.match(s) + attachment_id = matches[1] + + upload, filename = find_upload(user, post, attachment_id) + unless upload + fail_count += 1 + next + end + + html_for_upload(upload, filename) + end + + if new_raw != post.raw + PostRevisor.new(post).revise!(post.user, { raw: new_raw }, { bypass_bump: true, edit_reason: '从 Discuz 中导入附件' }) + end + + success_count += 1 + end + + puts '', '' + puts "succeeded: #{success_count}" + puts " failed: #{fail_count}" if fail_count > 0 + puts '' + end + + # Create the full path to the discuz avatar specified from user id + def discuzx_avatar_fullpath(user_id) + padded_id = user_id.to_s.rjust(9, '0') + + part_1 = padded_id[0..2] + part_2 = padded_id[3..4] + part_3 = padded_id[5..6] + part_4 = padded_id[-2..-1] + file_name = "#{part_4}_avatar_big.jpg" + + return File.join(DISCUZX_BASE_DIR, AVATAR_DIR, part_1, part_2, part_3, file_name), file_name + end + + # post id is in the quote block + def find_post_id_by_quote_number(raw) + s = raw.dup + quote_post_id = s.match(/\[quote\][\S\s]*pid=(\d+)[\S\s]*\[\/quote\]/) + + quote_post_id ? quote_post_id[1] : nil + end + + # for some reason, discuz inlined some png file + # the corresponding image stored is broken in a way + def upload_inline_image(data) + return unless data + + puts 'Creating inline image' + + encoded_photo = data['data:image/png;base64,'.length .. -1] + if encoded_photo + raw_file = Base64.decode64(encoded_photo) + else + puts 'Error parsed inline photo', data[0..20] + return + end + + real_filename = "#{SecureRandom.hex}.png" + filename = Tempfile.new(['inline', '.png']) + begin + filename.binmode + filename.write(raw_file) + filename.rewind + + upload = create_upload(Discourse::SYSTEM_USER_ID, filename, real_filename) + ensure + filename.close rescue nil + filename.unlink rescue nil + end + + if upload.nil? || !upload.valid? + puts "Upload not valid :(" + puts upload.errors.inspect if upload + return nil + end + + return upload, real_filename + end + + # find the uploaded file and real name from the db + def find_upload(user, post, upload_id) + attachment_table = table_name 'forum_attachment' + # search for table id + sql = "SELECT a.pid post_id, + a.aid upload_id, + a.tableid table_id + FROM #{attachment_table} a + WHERE a.pid = #{post.custom_fields['import_id']} + AND a.aid = #{upload_id};" + results = mysql_query(sql) + + unless (meta_data = results.first) + puts "Couldn't find forum_attachment record meta data for post.id = #{post.id}, import_id = #{post.custom_fields['import_id']}" + return nil + end + + # search for uploaded file meta data + sql = "SELECT a.pid post_id, + a.aid upload_id, + a.tid topic_id, + a.uid user_id, + a.dateline uploaded_time, + a.filename real_filename, + a.attachment attachment_path, + a.remote is_remote, + a.description description, + a.isimage is_image, + a.thumb is_thumb + FROM #{attachment_table}_#{meta_data['table_id']} a + WHERE a.aid = #{upload_id};" + results = mysql_query(sql) + + unless (row = results.first) + puts "Couldn't find attachment record for post.id = #{post.id}, import_id = #{post.custom_fields['import_id']}" + return nil + end + + filename = File.join(DISCUZX_BASE_DIR, ATTACHMENT_DIR, row['attachment_path']) + unless File.exists?(filename) + puts "Attachment file doesn't exist: #{filename}" + return nil + end + real_filename = row['real_filename'] + real_filename.prepend SecureRandom.hex if real_filename[0] == '.' + upload = create_upload(user.id, filename, real_filename) + + if upload.nil? || !upload.valid? + puts "Upload not valid :(" + puts upload.errors.inspect if upload + return nil + end + + return upload, real_filename + rescue Mysql2::Error => e + puts "SQL Error" + puts e.message + puts sql + return nil + end + + def mysql_query(sql) + @client.query(sql, cache_rows: false) + end +end + +ImportScripts::DiscuzX.new.perform