# custom importer for www.sfn.org, feel free to borrow ideas

require "csv"
require "mysql2"
require File.expand_path(File.dirname(__FILE__) + "/base.rb")

# Imports users, categories, topics and replies from the "sfn" MySQL database
# (plus a CSV export of external users) through the ImportScripts::Base helpers.
class ImportScripts::Sfn < ImportScripts::Base
  # Number of rows fetched per SQL query (used for LIMIT and passed to `batches`).
  BATCH_SIZE = 100_000

  # Lower bound applied to users' creation dates in the users query.
  MIN_CREATED_AT = "2003-11-01"

  def initialize
    super
  end

  # Runs every import step in order.
  def execute
    load_external_users
    import_users
    import_categories
    import_topics
    import_posts
  end

  # Builds @external_users, a hash of ContactKey => { email:, full_name: },
  # from the CSV export. CSV rows are matched to contacts through the first
  # id listed in Contact.PersonifyID.
  def load_external_users
    puts "", "loading external users..."

    @personify_id_to_contact_key = {}

    contacts = mysql_query <<-SQL
      SELECT ContactKey AS "contact_key",
             PersonifyID AS "personify_id"
        FROM Contact
    SQL

    contacts.each do |contact|
      # Only the first entry of a comma-separated PersonifyID is used.
      # `to_s` keeps a NULL PersonifyID from raising on `split`.
      personify_id = contact["personify_id"].to_s.split(",").first
      next if personify_id.blank?

      @personify_id_to_contact_key[personify_id] = contact["contact_key"]
    end

    @external_users = {}

    CSV.foreach("/Users/zogstrip/Downloads/sfn.csv", col_sep: ";") do |row|
      next unless @personify_id_to_contact_key.include?(row[0])

      id = @personify_id_to_contact_key[row[0]]
      # Columns 1..3 are joined into the full name; empty CSV cells come back
      # as nil, hence the `to_s` before `strip`.
      full_name = row.values_at(1, 2, 3).map { |part| part.to_s.strip }.join(" ").strip
      @external_users[id] = { email: row[4], full_name: full_name }
    end
  end

  # Imports one user per Contact row. Email and full name come from the
  # database when present, otherwise from @external_users; rows without any
  # email are skipped.
  def import_users
    puts "", "importing users..."

    user_count = mysql_query <<-SQL
      SELECT COUNT(ContactKey) AS "count" FROM Contact
    SQL
    user_count = user_count.first["count"]

    batches(BATCH_SIZE) do |offset|
      users = mysql_query <<-SQL
          SELECT c.ContactKey AS "id",
                 c.Bio AS "bio",
                 c.ProfileImage AS "avatar",
                 es.EmailAddr_ AS "email",
                 es.FullName_ AS "full_name",
                 GREATEST('#{MIN_CREATED_AT}', COALESCE(cm.InvitedOn, '#{MIN_CREATED_AT}')) AS "created_at"
            FROM Contact c
       LEFT JOIN EgroupSubscription es ON es.ContactKey = c.ContactKey
       LEFT JOIN CommunityMember cm ON cm.ContactKey = c.ContactKey
        GROUP BY c.ContactKey
        ORDER BY cm.InvitedOn
           LIMIT #{BATCH_SIZE}
          OFFSET #{offset}
      SQL

      break if users.size < 1

      create_users(users, total: user_count, offset: offset) do |user|
        external_user = @external_users[user["id"]]
        email = user["email"].presence || external_user.try(:[], :email)
        full_name = user["full_name"].presence || external_user.try(:[], :full_name)
        bio = (user["bio"] || "")[0..250]

        next if email.blank?

        {
          id: user["id"],
          email: email,
          name: full_name,
          bio_raw: bio,
          created_at: user["created_at"],
          post_create_action: proc { |newuser| import_avatar(newuser, user["avatar"]) },
        }
      end
    end
  end

  # Names of the categories created by this import.
  NEW_CATEGORIES = [
    "Abstract Topic Matching Forum",
    "Animals in Research",
    "Brain Awareness and Teaching",
    "Career Advice",
    "Career Paths",
    "Diversity",
    "Early Career Policy Advocates",
    "LATP Associates",
    "LATP Fellows",
    "Mid and Advanced Career",
    "Neurobiology of Disease Workshop",
    "Neuronline Champions",
    "Neuroscience 2015",
    "Neuroscience Scholars Program",
    "NSP Associates",
    "NSP Fellows",
    "Outreach",
    "Postdocs and Early Career",
    "Program Committee",
    "Program Development",
    "Roommate Matching Forum",
    "Scientific Research",
    "Students",
  ].freeze

  # EgroupKey => New Category Name
  CATEGORY_MAPPING = {
    "{DE10E4F4-621A-48BF-9B45-05D9F774A590}" => "Abstract Topic Matching Forum",
    "{3FFC1217-1576-4D38-BB81-D6CADC7FB793}" => "Animals in Research",
    "{9362BB21-BF6C-4E55-A3E0-18CD5D9F3323}" => "Brain Awareness and Teaching",
    "{3AC01B09-A21F-4166-95DA-0E585E271075}" => "Brain Awareness and Teaching",
    "{C249728D-8C9E-4138-AA49-D02467C28EAD}" => "Career Advice",
    "{01570B85-0124-478F-A8B9-B028BD1B1F2F}" => "Career Paths",
    "{2A430528-278A-46CD-BE1A-07CFA1122919}" => "Diversity",
    "{2F211345-3C19-43C9-90B5-27BA9FCD4DB0}" => "Diversity",
    "{8092297D-8DF4-404A-8BEB-4D5D0DC6A191}" => "Early Career Policy Advocates",
    "{8CB58762-D562-448C-9AF1-8DAE6C482C9B}" => "LATP Associates",
    "{CDF80A92-925A-46DD-A867-8558FA72D016}" => "LATP Fellows",
    "{E71E237B-7C23-4596-AECA-655BD8ED50DB}" => "Mid and Advanced Career",
    "{1D674C38-17CB-4C48-826A-D465AC3F8948}" => "Neurobiology of Disease Workshop",
    "{80C5835E-974E-4D44-BA01-C2C4F8BA91D7}" => "Neuronline Champions",
    "{3D4F885B-0037-403B-83DD-62FAA8E81DF1}" => "Neuroscience 2015",
    "{9ACC3B40-E4A3-4FFD-AADC-C8403EB6231D}" => "Neuroscience 2015",
    "{9FC30FFB-E450-4361-8844-0266C3D96868}" => "Neuroscience Scholars Program",
    "{3E78123E-87CE-435E-B4B7-7DAB1A21C541}" => "NSP Associates",
    "{12D889D3-5CFD-49D5-93E4-32AAB2CFFCDA}" => "NSP Fellows",
    "{FA86D79E-170E-4F53-8F1C-942CB3FFB19E}" => "Outreach",
    "{D7041C64-3D32-4010-B3D8-71858323CB4A}" => "Outreach",
    "{69B76913-4E23-4C80-A11E-9CDB4130722E}" => "Outreach",
    "{774878EA-96AD-49F5-9D29-105AEA488007}" => "Outreach",
    "{E6349704-FD01-41B1-9C59-68E928DD4318}" => "Postdocs and Early Career",
    "{31CF5944-2567-4E79-9730-18EEC23E5B52}" => "Postdocs and Early Career",
    "{5625C403-AFAE-4323-A470-33FC32B12B53}" => "Program Committee",
    "{8415D871-54F5-4128-B099-E5A376A6B41B}" => "Program Development",
    "{B4DF2044-47AB-4329-8BF7-0D832CAB402C}" => "Roommate Matching Forum",
    "{6A3A12B9-5C72-472F-97AC-F34983674960}" => "Scientific Research",
    "{2CF635E9-4866-451C-A4F2-E2A8A80FED54}" => "Scientific Research",
    "{CF2DDCCE-737F-499D-AFE4-E5C36F195C8B}" => "Scientific Research",
    "{282B48D7-AC1D-453E-9806-3C6CE6830EF9}" => "Scientific Research",
    "{6D750CAF-E96F-4AD1-A45B-7B74FDFF0B40}" => "Scientific Research",
    "{10AF5D45-BEB3-4F07-BE77-0BAB6910DE10}" => "Scientific Research",
    "{18D7F624-26D1-44B9-BF33-AB5C5A2AB2BF}" => "Scientific Research",
    "{6016FF4F-D834-4888-BA03-F9FE8CB1D4CC}" => "Scientific Research",
    "{B0290A37-EA39-4CB8-B6CB-3E0B7EF6D036}" => "Scientific Research",
    "{97CC60D0-B93A-43FF-BB48-366FAAEE2BAC}" => "Scientific Research",
    "{8FC9B57B-2755-4FC5-90E8-CCDB56CF2F66}" => "Scientific Research",
    "{57C8BF37-357E-4FE6-952D-906248642792}" => "Scientific Research",
    "{7B2A3B63-BC2C-4219-830C-BA1DECB33337}" => "Scientific Research",
    "{0ED1D205-0E48-48D2-B82B-3CE80C6C553F}" => "Scientific Research",
    "{10355962-D172-4294-AA8E-1BC381B67971}" => "Scientific Research",
    "{C84B0222-5232-4B94-9FB8-DDF802241171}" => "Scientific Research",
    "{9143F984-0D67-46CB-AAAF-7FE3B6335E07}" => "Scientific Research",
    "{1392DC10-37A0-46A6-9979-4568D0224C5F}" => "Scientific Research",
    "{E4891409-0F4F-4151-B550-ECE53655E231}" => "Scientific Research",
    "{9613BAC2-229B-4563-9E1C-35C31CDDCE2F}" => "Students",
  }.freeze

  # Creates one category per NEW_CATEGORIES entry; the category name doubles
  # as its import id.
  def import_categories
    puts "", "importing categories..."

    create_categories(NEW_CATEGORIES) { |category| { id: category, name: category } }
  end

  # Imports approved, non-cross-posted root messages (ParentId_ = 0) as topics.
  # Messages whose EgroupKey is not in CATEGORY_MAPPING, or whose cleaned body
  # is blank, are skipped.
  def import_topics
    puts "", "importing topics..."

    topic_count = mysql_query <<-SQL
      SELECT COUNT(MessageID_) AS "count"
        FROM EgroupMessages
       WHERE ParentId_ = 0
         AND ApprovedRejectedPendingInd = "Approved"
         AND (CrosspostFromMessageKey IS NULL OR CrosspostFromMessageKey = '{00000000-0000-0000-0000-000000000000}')
    SQL
    topic_count = topic_count.first["count"]

    batches(BATCH_SIZE) do |offset|
      topics = mysql_query <<-SQL
          SELECT MessageID_ AS "id",
                 EgroupKey AS "category_id",
                 ContactKey AS "user_id",
                 HdrSubject_ AS "title",
                 Body_ AS "raw",
                 CreatStamp_ AS "created_at"
            FROM EgroupMessages
           WHERE ParentId_ = 0
             AND ApprovedRejectedPendingInd = "Approved"
             AND (CrosspostFromMessageKey IS NULL OR CrosspostFromMessageKey = '{00000000-0000-0000-0000-000000000000}')
        ORDER BY CreatStamp_
           LIMIT #{BATCH_SIZE}
          OFFSET #{offset}
      SQL

      break if topics.size < 1

      create_posts(topics, total: topic_count, offset: offset) do |topic|
        category_name = CATEGORY_MAPPING[topic["category_id"]]
        next unless category_name

        # `to_s` keeps a NULL subject from raising on the slice.
        title = topic["title"].to_s[0..250]
        raw = cleanup_raw(topic["raw"])
        next if raw.blank?

        {
          id: topic["id"],
          category: category_id_from_imported_category_id(category_name),
          user_id: user_id_from_imported_user_id(topic["user_id"]) || Discourse::SYSTEM_USER_ID,
          title: title,
          raw: raw,
          created_at: topic["created_at"],
        }
      end
    end
  end

  # Imports approved, non-cross-posted replies (ParentId_ > 0). Replies whose
  # parent cannot be resolved by `topic_lookup_from_imported_post_id`, or whose
  # cleaned body is blank, are skipped.
  def import_posts
    puts "", "importing posts..."

    posts_count = mysql_query <<-SQL
      SELECT COUNT(MessageID_) AS "count"
        FROM EgroupMessages
       WHERE ParentId_ > 0
         AND ApprovedRejectedPendingInd = "Approved"
         AND (CrosspostFromMessageKey IS NULL OR CrosspostFromMessageKey = '{00000000-0000-0000-0000-000000000000}')
    SQL
    posts_count = posts_count.first["count"]

    batches(BATCH_SIZE) do |offset|
      posts = mysql_query <<-SQL
          SELECT MessageID_ AS "id",
                 ContactKey AS "user_id",
                 ParentID_ AS "topic_id",
                 Body_ AS "raw",
                 CreatStamp_ AS "created_at"
            FROM EgroupMessages
           WHERE ParentId_ > 0
             AND ApprovedRejectedPendingInd = "Approved"
             AND (CrosspostFromMessageKey IS NULL OR CrosspostFromMessageKey = '{00000000-0000-0000-0000-000000000000}')
        ORDER BY CreatStamp_
           LIMIT #{BATCH_SIZE}
          OFFSET #{offset}
      SQL

      break if posts.size < 1

      create_posts(posts, total: posts_count, offset: offset) do |post|
        parent = topic_lookup_from_imported_post_id(post["topic_id"])
        next unless parent

        raw = cleanup_raw(post["raw"])
        next if raw.blank?

        {
          id: post["id"],
          topic_id: parent[:topic_id],
          user_id: user_id_from_imported_user_id(post["user_id"]) || Discourse::SYSTEM_USER_ID,
          # Reuse the already-cleaned body instead of cleaning it a second time.
          raw: raw,
          created_at: post["created_at"],
        }
      end
    end
  end

  # Returns a cleaned copy of a message body; the argument is not modified.
  # A nil body yields "" so callers' `blank?` checks skip the message.
  def cleanup_raw(raw)
    return "" if raw.nil?

    # fix some html: turn <br>, <br/> and <br /> into newlines
    # NOTE(review): the pattern here had been reduced to an empty regex, which
    # matches at every position; restored as a <br> matcher - confirm.
    cleaned = raw.gsub(%r{<br\s*/?>}i, "\n")
    # remove "This message has been cross posted to the following eGroups: ..."
    cleaned = cleaned.gsub(/^This message has been cross posted to the following eGroups: .+\n-{3,}/i, "")
    # remove signatures (everything from the first run of 3+ dashes onwards)
    cleaned = cleaned.gsub(/-{3,}.+/m, "")
    # strip leading/trailing whitespaces
    cleaned.strip
  end

  # Runs `sql` on a lazily created, memoized connection to the local "sfn"
  # database and returns the Mysql2 result.
  def mysql_query(sql)
    @client ||= Mysql2::Client.new(username: "root", database: "sfn")
    @client.query(sql)
  end

  private

  # Uploads `avatar_data` as "avatar.jpg" for `newuser` and, when the upload
  # is persisted, sets it as the user's avatar. Does nothing for blank data.
  def import_avatar(newuser, avatar_data)
    return if avatar_data.blank?

    avatar = Tempfile.new("sfn-avatar")
    begin
      avatar.write(avatar_data.encode("ASCII-8BIT").force_encoding("UTF-8"))
      avatar.rewind

      upload = Upload.create_for(newuser.id, avatar, "avatar.jpg", avatar.size)
      if upload.persisted?
        newuser.create_user_avatar
        newuser.user_avatar.update(custom_upload_id: upload.id)
        newuser.update(uploaded_avatar_id: upload.id)
      end
    ensure
      # Always remove the temp file, even when the upload raised; a failure
      # to close is deliberately ignored (best effort).
      begin
        avatar.close!
      rescue StandardError
        nil
      end
    end
  end
end

ImportScripts::Sfn.new.perform