2014-08-18 13:04:08 +02:00
require File . expand_path ( File . dirname ( __FILE__ ) + " /base.rb " )
2014-12-22 13:22:16 +01:00
require 'mysql2'
2015-01-19 15:00:55 +01:00
require 'htmlentities'
2014-08-18 13:04:08 +02:00
class ImportScripts :: VBulletin < ImportScripts :: Base
2015-01-19 15:00:55 +01:00
# Number of rows requested per query by the batched imports (used as LIMIT).
BATCH_SIZE = 1000

# CHANGE THESE BEFORE RUNNING THE IMPORTER
# Name of the MySQL database to import from; frozen so it cannot be mutated by accident.
DATABASE = " iref ".freeze
# Timezone name handed to TZInfo::Timezone.get; frozen for the same reason.
TIMEZONE = " Asia/Kolkata ".freeze
2014-08-18 13:04:08 +02:00
2014-12-22 13:22:16 +01:00
# Prepares the importer state: the old-username => new-username map, the
# timezone and HTML-entity helpers, and the MySQL client for DATABASE.
def initialize
  super

  # connection settings for the source database
  connection_options = {
    host: " localhost ",
    username: " root ",
    database: DATABASE
  }

  @old_username_to_new_usernames = {}
  @tz = TZInfo::Timezone.get(TIMEZONE)
  @htmlentities = HTMLEntities.new
  @client = Mysql2::Client.new(connection_options)
end
# Runs the import phases one after another, in this fixed order, and returns
# whatever the last phase returned.
def execute
  phases = [
    :import_groups,
    :import_users,
    :import_categories,
    :import_topics,
    :import_posts,
    :close_topics,
    :post_process_posts
  ]

  phases.map { |phase| send(phase) }.last
end
2014-12-22 13:22:16 +01:00
# Reads every row of the `usergroup` table and hands it to create_groups,
# using the HTML-decoded, stripped title as the group name.
def import_groups
  puts " ", " importing groups... "

  group_rows = mysql_query <<-SQL
SELECT usergroupid , title
FROM usergroup
ORDER BY usergroupid
SQL

  create_groups(group_rows) do |row|
    decoded_title = @htmlentities.decode(row[" title "])
    { id: row[" usergroupid "], name: decoded_title.strip }
  end
end
2014-08-18 13:04:08 +02:00
2014-12-22 13:22:16 +01:00
# Imports the `user` table in batches of BATCH_SIZE rows. For each row a user
# hash is handed to create_users; once the user exists, the mapping from the
# old username to the new one is recorded and the profile images are imported.
def import_users
  puts " ", " importing users "

  total_users = mysql_query(" SELECT COUNT(userid) count FROM user ").first[" count "]

  # TODO: add email back in when using real data
  batches(BATCH_SIZE) do |offset|
    rows = mysql_query <<-SQL
SELECT userid , username , homepage , usertitle , usergroupid , joindate
FROM user
ORDER BY userid
LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL

    # an empty page means every user has been read
    break if rows.size < 1

    create_users(rows, total: total_users, offset: offset) do |row|
      decoded_username = @htmlentities.decode(row[" username "]).strip

      # runs after the user record has been created
      after_create = proc do |new_user|
        @old_username_to_new_usernames[row[" username "]] = new_user.username
        import_profile_picture(row, new_user)
        import_profile_background(row, new_user)
      end

      {
        id: row[" userid "],
        name: decoded_username,
        username: decoded_username,
        # the query above does not select an email, so this falls back to fake_email
        email: row[" email "].presence || fake_email,
        website: row[" homepage "].strip,
        title: @htmlentities.decode(row[" usertitle "]).strip,
        primary_group_id: group_id_from_imported_group_id(row[" usergroupid "]),
        created_at: parse_timestamp(row[" joindate "]),
        post_create_action: after_create
      }
    end
  end
end
2014-08-18 13:04:08 +02:00
2015-01-19 15:00:55 +01:00
# Imports the most recent custom avatar of a user.
#
# old_user      - row from the source `user` table (its "userid" is used).
# imported_user - the user record that was just created.
#
# Returns early (nil) when the user has no row in `customavatar` or when the
# upload could not be persisted. The temporary file is always closed and
# removed, whichever way the method exits.
def import_profile_picture(old_user, imported_user)
  query = mysql_query <<-SQL
SELECT filedata , filename
FROM customavatar
WHERE userid = #{old_user["userid"]}
ORDER BY dateline DESC
LIMIT 1
SQL

  picture = query.first
  return if picture.nil?

  file = Tempfile.new(" profile-picture ")
  file.write(picture[" filedata "].encode(" ASCII-8BIT ").force_encoding(" UTF-8 "))
  file.rewind

  upload = Upload.create_for(imported_user.id, file, picture[" filename "], file.size)
  return if !upload.persisted?

  imported_user.create_user_avatar
  imported_user.user_avatar.update(custom_upload_id: upload.id)
  imported_user.update(uploaded_avatar_id: upload.id)
ensure
  # `file` is still nil when we returned before creating the tempfile
  if file
    # best-effort cleanup: a failure here must not abort the import
    file.close rescue nil
    file.unlink rescue nil
  end
end
# Imports the most recent row of `customprofilepic` for a user and stores the
# resulting upload URL as the profile background. Does nothing when there is
# no such row or when the upload was not persisted.
def import_profile_background(old_user, imported_user)
  rows = mysql_query <<-SQL
SELECT filedata , filename
FROM customprofilepic
WHERE userid = #{old_user["userid"]}
ORDER BY dateline DESC
LIMIT 1
SQL

  background = rows.first
  return if background.nil?

  file = Tempfile.new(" profile-background ")
  image_bytes = background[" filedata "].encode(" ASCII-8BIT ").force_encoding(" UTF-8 ")
  file.write(image_bytes)
  file.rewind

  upload = Upload.create_for(imported_user.id, file, background[" filename "], file.size)

  imported_user.user_profile.update(profile_background: upload.url) if upload.persisted?
ensure
  # best-effort cleanup of the temporary file
  file.close rescue nil
  file.unlink rescue nil
end
2014-12-22 13:22:16 +01:00
# Imports the `forum` table as categories. Forums whose parentid is -1 are
# created first as top-level categories; every other forum is re-parented
# onto its top-level ancestor and then created as a child category.
def import_categories
  # attributes shared by top-level and child categories
  base_attributes = lambda do |forum|
    {
      id: forum[" forumid "],
      name: @htmlentities.decode(forum[" title "]).strip,
      position: forum[" displayorder "],
      description: @htmlentities.decode(forum[" description "]).strip
    }
  end

  puts " ", " importing top level categories... "

  forums = mysql_query(" SELECT forumid, title, description, displayorder, parentid FROM forum ORDER BY forumid ").to_a
  roots = forums.select { |forum| forum[" parentid "] == -1 }

  create_categories(roots) { |forum| base_attributes.call(forum) }

  puts " ", " importing children categories... "

  descendants = forums.reject { |forum| forum[" parentid "] == -1 }
  root_ids = Set.new(roots.map { |forum| forum[" forumid "] })

  # cut down the tree to only 2 levels of categories
  descendants.each do |child|
    until root_ids.include?(child[" parentid "])
      parent = forums.detect { |forum| forum[" forumid "] == child[" parentid "] }
      child[" parentid "] = parent[" parentid "]
    end
  end

  create_categories(descendants) do |forum|
    parent_category = category_from_imported_category_id(forum[" parentid "])
    base_attributes.call(forum).merge(parent_category_id: parent_category.try(:[], " id "))
  end
end
2014-08-18 13:04:08 +02:00
2014-12-22 13:22:16 +01:00
# Imports every thread as a topic, in batches of BATCH_SIZE rows; the body of
# the topic comes from the thread's first post. The import ids of threads
# that are not open are collected in @closed_topic_ids for close_topics.
def import_topics
  puts " ", " importing topics... "

  # keep track of closed topics
  @closed_topic_ids = []

  topic_count = mysql_query(" SELECT COUNT(threadid) count FROM thread ").first[" count "]

  batches(BATCH_SIZE) do |offset|
    topics = mysql_query <<-SQL
SELECT t . threadid threadid , t . title title , forumid , open , postuserid , t . dateline dateline , views , t . visible visible , sticky ,
p . pagetext raw
FROM thread t
JOIN post p ON p . postid = t . firstpostid
ORDER BY t . threadid
LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL

    # an empty page means every thread has been read
    break if topics.size < 1

    create_posts(topics, total: topic_count, offset: offset) do |topic|
      # threads whose body cannot be preprocessed, or ends up blank, are skipped
      raw = preprocess_post_raw(topic[" raw "]) rescue nil
      next if raw.blank?

      topic_id = " thread- #{topic[" threadid "]} "

      # Normalize with to_i (like `visible` and `sticky` below): the driver may
      # return the column as an Integer, which never equals a String.
      @closed_topic_ids << topic_id if topic[" open "].to_i == 0

      t = {
        id: topic_id,
        user_id: user_id_from_imported_user_id(topic[" postuserid "]) || Discourse::SYSTEM_USER_ID,
        title: @htmlentities.decode(topic[" title "]).strip[0...255],
        category: category_from_imported_category_id(topic[" forumid "]).try(:name),
        raw: raw,
        created_at: parse_timestamp(topic[" dateline "]),
        visible: topic[" visible "].to_i == 1,
        views: topic[" views "],
      }
      # sticky threads are pinned as of their creation time
      t[:pinned_at] = t[:created_at] if topic[" sticky "].to_i == 1
      t
    end
  end
end
2014-08-18 13:04:08 +02:00
2014-12-22 13:22:16 +01:00
# Imports every post that is not the first post of a thread, in batches of
# BATCH_SIZE rows. Posts whose thread was not imported, or whose body is blank
# after preprocessing, are skipped.
def import_posts
  puts " ", " importing posts... "

  # make sure `firstpostid` is indexed
  begin
    mysql_query(" CREATE INDEX firstpostid_index ON thread (firstpostid) ")
  rescue Mysql2::Error => e
    # Typically the index already exists because the importer ran before;
    # the index is only an optimisation, so carry on.
    puts "", "skipping index creation: #{e.message}"
  end

  post_count = mysql_query(" SELECT COUNT(postid) count FROM post WHERE postid NOT IN (SELECT firstpostid FROM thread) ").first[" count "]

  batches(BATCH_SIZE) do |offset|
    posts = mysql_query <<-SQL
SELECT postid , userid , threadid , pagetext raw , dateline , visible , parentid
FROM post
WHERE postid NOT IN ( SELECT firstpostid FROM thread )
ORDER BY postid
LIMIT #{BATCH_SIZE}
OFFSET #{offset}
SQL

    # an empty page means every post has been read
    break if posts.size < 1

    create_posts(posts, total: post_count, offset: offset) do |post|
      # posts whose body cannot be preprocessed, or ends up blank, are skipped
      raw = preprocess_post_raw(post[" raw "]) rescue nil
      next if raw.blank?
      next unless topic = topic_lookup_from_imported_post_id(" thread- #{post[" threadid "]} ")

      mapped = {
        id: post[" postid "],
        user_id: user_id_from_imported_user_id(post[" userid "]) || Discourse::SYSTEM_USER_ID,
        topic_id: topic[:topic_id],
        raw: raw,
        created_at: parse_timestamp(post[" dateline "]),
        hidden: post[" visible "].to_i == 0,
      }

      # link the reply to its parent when the parent post was imported
      if parent = topic_lookup_from_imported_post_id(post[" parentid "])
        mapped[:reply_to_post_number] = parent[:post_number]
      end

      mapped
    end
  end
end
2014-08-18 13:04:08 +02:00
2014-12-22 13:22:16 +01:00
# Sets `closed = true` on every topic that owns a post whose `import_id`
# custom field is one of the ids collected in @closed_topic_ids.
def close_topics
  puts " ", " Closing topics... "

  Topic.exec_sql(<<-SQL, @closed_topic_ids)
WITH closed_topic_ids AS (
SELECT t . id AS topic_id
FROM post_custom_fields pcf
JOIN posts p ON p . id = pcf . post_id
JOIN topics t ON t . id = p . topic_id
WHERE pcf . name = 'import_id'
AND pcf . value IN ( ?)
)
UPDATE topics
SET closed = true
WHERE id IN ( SELECT topic_id FROM closed_topic_ids )
SQL
end
2014-08-25 10:48:29 +02:00
2015-01-19 15:00:55 +01:00
# Runs postprocess_post_raw over every Post and saves the ones whose raw text
# changed. A PrettyText::JavaScriptError on one post is ignored so the
# remaining posts are still processed; progress is printed after every post.
def post_process_posts
  puts " ", " Postprocessing posts... "

  processed = 0
  total = Post.count

  Post.find_each do |post|
    begin
      rewritten = postprocess_post_raw(post.raw)
      unless rewritten == post.raw
        post.raw = rewritten
        post.save
      end
    rescue PrettyText::JavaScriptError
      nil
    ensure
      # count the post whether or not it could be processed
      processed += 1
      print_status(processed, total)
    end
  end
end
2014-12-22 13:22:16 +01:00
# Rewrites the raw text of a source post before it is imported.
#
# In order: decodes HTML entities, normalizes escaped newlines/tabs, removes
# [ATTACH] tags, turns [HTML]/[PHP]/[HIGHLIGHT]/[CODE] into fenced code blocks
# and [SAMP] into inline code, handles chevrons, converts [URL=...] into
# Markdown links, strips bare [URL]/[MP3] tags, converts [MENTION] and
# [QUOTE] (mapping old usernames through @old_username_to_new_usernames) and
# turns [YOUTUBE]/[VIDEO=youtube;...] into youtu.be links.
#
# raw - the post body as read from the database.
#
# Returns the rewritten string; blank input returns the literal on the first
# line of the body.
#
# NOTE(review): every string and regexp literal in this method carries
# whitespace around its tokens, which looks like an extraction artifact --
# confirm against the upstream file before relying on the exact patterns.
def preprocess_post_raw ( raw )
return " " if raw . blank?
2014-08-18 13:04:08 +02:00
2015-01-19 15:00:55 +01:00
# decode HTML entities
raw = @htmlentities . decode ( raw )
# fix whitespaces
2014-12-22 13:22:16 +01:00
raw = raw . gsub ( / ( \\ r)? \\ n / , " \n " )
. gsub ( " \\ t " , " \t " )
2014-08-18 13:04:08 +02:00
2014-12-22 13:22:16 +01:00
# remove attachments
raw = raw . gsub ( / \ [attach[^ \ ]]* \ ] \ d+ \ [ \/ attach \ ] /i , " " )
2014-08-18 13:04:08 +02:00
2014-12-22 13:22:16 +01:00
# [HTML]...[/HTML]
raw = raw . gsub ( / \ [html \ ] /i , " \n ```html \n " )
. gsub ( / \ [ \/ html \ ] /i , " \n ``` \n " )
2014-08-18 13:04:08 +02:00
2014-12-22 13:22:16 +01:00
# [PHP]...[/PHP]
raw = raw . gsub ( / \ [php \ ] /i , " \n ```php \n " )
. gsub ( / \ [ \/ php \ ] /i , " \n ``` \n " )
2014-08-18 13:04:08 +02:00
2014-12-22 13:22:16 +01:00
# [HIGHLIGHT="..."]
# the captured language name is lower-cased and used as the fence language
raw = raw . gsub ( / \ [highlight="?( \ w+)"? \ ] /i ) { " \n ``` #{ $1 . downcase } \n " }
2014-08-18 13:04:08 +02:00
2014-12-22 13:22:16 +01:00
# [CODE]...[/CODE]
# [HIGHLIGHT]...[/HIGHLIGHT]
raw = raw . gsub ( / \ [ \/ ?code \ ] /i , " \n ``` \n " )
. gsub ( / \ [ \/ ?highlight \ ] /i , " \n ``` \n " )
2014-09-04 17:55:05 +02:00
2014-12-22 13:22:16 +01:00
# [SAMP]...[/SAMP]
raw = raw . gsub ( / \ [ \/ ?samp \ ] /i , " ` " )
2014-09-04 17:55:05 +02:00
2014-12-22 13:22:16 +01:00
# replace all chevrons with HTML entities
# NOTE: must be done
# - AFTER all the "code" processing
# - BEFORE the "quote" processing
# Chevrons inside inline code are swapped for a placeholder character first
# and restored afterwards, so only chevrons outside inline code are replaced.
# NOTE(review): as written, the middle gsub replaces a chevron with the same
# chevron; presumably the replacement was meant to be an HTML entity -- confirm.
raw = raw . gsub ( / `([^`]+)` /im ) { " ` " + $1 . gsub ( " < " , " \ u2603 " ) + " ` " }
. gsub ( " < " , " < " )
. gsub ( " \ u2603 " , " < " )
2014-08-18 13:04:08 +02:00
2014-12-22 13:22:16 +01:00
raw = raw . gsub ( / `([^`]+)` /im ) { " ` " + $1 . gsub ( " > " , " \ u2603 " ) + " ` " }
. gsub ( " > " , " > " )
. gsub ( " \ u2603 " , " > " )
2014-08-18 13:04:08 +02:00
2014-12-22 13:22:16 +01:00
# [URL=...]...[/URL]
# NOTE(review): the link-text group `(.+)` is greedy, so two [URL=...] tags
# on one line would be merged into a single link -- consider `(.+?)`.
raw = raw . gsub ( / \ [url="?(.+?)"? \ ](.+) \ [ \/ url \ ] /i ) { " [ #{ $2 } ]( #{ $1 } ) " }
2014-08-18 13:04:08 +02:00
2014-12-22 13:22:16 +01:00
# [URL]...[/URL]
# [MP3]...[/MP3]
raw = raw . gsub ( / \ [ \/ ?url \ ] /i , " " )
. gsub ( / \ [ \/ ?mp3 \ ] /i , " " )
2014-08-18 13:04:08 +02:00
2014-12-22 13:22:16 +01:00
# [MENTION]<username>[/MENTION]
# the old username is replaced by the new one when a mapping was recorded
raw = raw . gsub ( / \ [mention \ ](.+?) \ [ \/ mention \ ] /i ) do
old_username = $1
if @old_username_to_new_usernames . has_key? ( old_username )
old_username = @old_username_to_new_usernames [ old_username ]
2014-08-18 13:04:08 +02:00
end
2014-12-22 13:22:16 +01:00
" @ #{ old_username } "
end
2014-08-18 13:04:08 +02:00
2014-12-22 13:22:16 +01:00
# [MENTION=<user_id>]<username>[/MENTION]
# raw = raw.gsub(/\[mention="?(\d+)"?\](.+?)\[\/mention\]/i) do
# user_id, old_username = $1, $2
# if user = @users.select { |u| u[:userid] == user_id }.first
# old_username = @old_username_to_new_usernames[user[:username]] || user[:username]
# end
# "@#{old_username}"
# end
# [QUOTE]...[/QUOTE]
raw = raw . gsub ( / \ [quote \ ](.+?) \ [ \/ quote \ ] /im ) { " \n > #{ $1 } \n " }
# [QUOTE=<username>]...[/QUOTE]
# quotes that also carry a post id ("username;id") are excluded by the
# character class here and handled in postprocess_post_raw
raw = raw . gsub ( / \ [quote=([^; \ ]]+) \ ](.+?) \ [ \/ quote \ ] /im ) do
old_username , quote = $1 , $2
if @old_username_to_new_usernames . has_key? ( old_username )
old_username = @old_username_to_new_usernames [ old_username ]
2014-08-18 13:04:08 +02:00
end
2014-12-22 13:22:16 +01:00
" \n [quote= \" #{ old_username } \" ] \n #{ quote } \n [/quote] \n "
2014-08-18 13:04:08 +02:00
end
2014-12-22 13:22:16 +01:00
# [YOUTUBE]<id>[/YOUTUBE]
raw = raw . gsub ( / \ [youtube \ ](.+?) \ [ \/ youtube \ ] /i ) { " \n //youtu.be/ #{ $1 } \n " }
# [VIDEO=youtube;<id>]...[/VIDEO]
raw = raw . gsub ( / \ [video=youtube;([^ \ ]]+) \ ].*? \ [ \/ video \ ] /i ) { " \n //youtu.be/ #{ $1 } \n " }
raw
end
2015-01-19 15:00:55 +01:00
# Rewrites references between posts in the raw text of an imported post.
#
# Handles, in order: [QUOTE=<username>;<post_id>], [THREAD]<id>[/THREAD],
# [THREAD=<id>]...[/THREAD], [POST]<id>[/POST] and [POST=<id>]...[/POST].
# Each reference is resolved with topic_lookup_from_imported_post_id; a
# [THREAD]/[POST] tag that cannot be resolved is left untouched, while an
# unresolved quote is still rewritten, just without the post/topic reference.
#
# raw - the raw text of an imported post.
#
# Returns the rewritten string.
#
# NOTE(review): the literals below carry whitespace around their tokens,
# which looks like an extraction artifact -- confirm against the upstream file.
def postprocess_post_raw ( raw )
# [QUOTE=<username>;<post_id>]...[/QUOTE]
# NOTE(review): unlike the username pattern in preprocess_post_raw, the
# username group here does not exclude the closing bracket, so it could
# presumably span an earlier tag -- verify.
raw = raw . gsub ( / \ [quote=([^;]+);( \ d+) \ ](.+?) \ [ \/ quote \ ] /im ) do
old_username , post_id , quote = $1 , $2 , $3
# use the new username when a mapping was recorded for the old one
if @old_username_to_new_usernames . has_key? ( old_username )
old_username = @old_username_to_new_usernames [ old_username ]
end
if topic_lookup = topic_lookup_from_imported_post_id ( post_id )
post_number = topic_lookup [ :post_number ]
topic_id = topic_lookup [ :topic_id ]
" \n [quote= \" #{ old_username } ,post: #{ post_number } ,topic: #{ topic_id } \" ] \n #{ quote } \n [/quote] \n "
else
" \n [quote= \" #{ old_username } \" ] \n #{ quote } \n [/quote] \n "
end
end
# [THREAD]<thread_id>[/THREAD]
# ==> http://my.discourse.org/t/slug/<topic_id>
raw = raw . gsub ( / \ [thread \ ]( \ d+) \ [ \/ thread \ ] /i ) do
thread_id = $1
if topic_lookup = topic_lookup_from_imported_post_id ( " thread- #{ thread_id } " )
topic_lookup [ :url ]
else
# unknown thread: keep the original tag ($& is the whole match)
$&
end
end
# [THREAD=<thread_id>]...[/THREAD]
# ==> [...](http://my.discourse.org/t/slug/<topic_id>)
raw = raw . gsub ( / \ [thread=( \ d+) \ ](.+?) \ [ \/ thread \ ] /i ) do
thread_id , link = $1 , $2
if topic_lookup = topic_lookup_from_imported_post_id ( " thread- #{ thread_id } " )
url = topic_lookup [ :url ]
" [ #{ link } ]( #{ url } ) "
else
$&
end
end
# [POST]<post_id>[/POST]
# ==> http://my.discourse.org/t/slug/<topic_id>/<post_number>
raw = raw . gsub ( / \ [post \ ]( \ d+) \ [ \/ post \ ] /i ) do
post_id = $1
if topic_lookup = topic_lookup_from_imported_post_id ( post_id )
topic_lookup [ :url ]
else
$&
end
end
# [POST=<post_id>]...[/POST]
# ==> [...](http://my.discourse.org/t/<topic_slug>/<topic_id>/<post_number>)
raw = raw . gsub ( / \ [post=( \ d+) \ ](.+?) \ [ \/ post \ ] /i ) do
post_id , link = $1 , $2
if topic_lookup = topic_lookup_from_imported_post_id ( post_id )
url = topic_lookup [ :url ]
" [ #{ link } ]( #{ url } ) "
else
$&
end
end
raw
end
# Passes the timestamp through @tz.utc_to_local and wraps the result in a
# time for the current Time.zone.
def parse_timestamp(timestamp)
  local_time = @tz.utc_to_local(timestamp)
  Time.zone.at(local_time)
end
2014-12-22 13:22:16 +01:00
# Builds a placeholder email address from a random hex string.
def fake_email
  "#{SecureRandom.hex} @domain.com "
end
# Runs a SQL statement on the source database and returns the client's
# result, with row caching turned off.
def mysql_query(sql)
  query_options = { cache_rows: false }
  @client.query(sql, query_options)
end
2014-08-18 13:04:08 +02:00
end
2014-12-22 13:22:16 +01:00
# Script entry point: build the importer and start it. `perform` is not
# defined in this file; presumably it comes from ImportScripts::Base -- confirm.
ImportScripts :: VBulletin . new . perform