diff --git a/script/import_scripts/zoho.rb b/script/import_scripts/zoho.rb
index 0b9feaffe..b78b44bd7 100644
--- a/script/import_scripts/zoho.rb
+++ b/script/import_scripts/zoho.rb
@@ -127,8 +127,8 @@ class ImportScripts::Zoho < ImportScripts::Base
id: import_post_id(row),
user_id: user_id,
category: category_id,
- title: row.topic_title,
- raw: row.content,
+ title: CGI.unescapeHTML(row.topic_title),
+ raw: cleanup_post(row.content),
created_at: Time.zone.parse(row.posted_time)
}
# created_post callback will be called
@@ -136,7 +136,7 @@ class ImportScripts::Zoho < ImportScripts::Base
{
id: import_post_id(row),
user_id: user_id,
- raw: row.content,
+ raw: cleanup_post(row.content),
created_at: Time.zone.parse(row.posted_time),
topic_id: @topic_mapping[row.permalink]
}
@@ -155,6 +155,47 @@ class ImportScripts::Zoho < ImportScripts::Base
end
end
+ # Note that Zoho doesn't render code blocks the same way all the time,
+ # but this seems to catch the most common format.
+ # NOTE(review): as written here the pattern contains only a line break, so
+ # it matches any post that contains a newline; the markup it was meant to
+ # match appears to be missing -- confirm against the upstream script.
+ ZOHO_CODE_BLOCK_START = /
/
+
+ # A run of three or more newlines and/or spaces.
+ TOO_MANY_LINE_BREAKS = /[\n ]{3,}/
+ # One style="..." attribute together with any whitespace in front of it.
+ # The value is matched with [^"]* so the match ends at the attribute's own
+ # closing quote; a greedy (.)* would run on to the last quote on the line and
+ # swallow other attributes and text along the way.
+ STYLE_ATTR = /\s*style="[^"]*"/
+
+ # Cleans up the raw content of a Zoho post before it is imported.
+ #
+ # Style attributes (STYLE_ATTR) are removed first. If the post matches
+ # ZOHO_CODE_BLOCK_START only list markup is stripped; otherwise the markup is
+ # cleaned up more aggressively. Finally every run matching
+ # TOO_MANY_LINE_BREAKS is replaced with "\n\n" and HTML entities are
+ # unescaped with CGI.unescapeHTML.
+ #
+ # raw - String holding the post content.
+ #
+ # Returns the cleaned-up String.
+ #
+ # NOTE(review): several patterns below are empty strings or a bare line
+ # break, and the last regex in the else branch has an unbalanced ")". They
+ # look like they lost their HTML tags -- confirm against the upstream script.
+ def cleanup_post(raw)
+   # Check if Zoho's most common form of a code block is present.
+   # If so, don't clean up the post as much because we can't tell which markup
+   # is inside the code block. These posts will look worse than others.
+   has_code_block = !!(raw =~ ZOHO_CODE_BLOCK_START)
+
+   # Non-destructive gsub here so the caller's string is left untouched;
+   # every later step mutates this copy in place.
+   x = raw.gsub(STYLE_ATTR, '')
+
+   if has_code_block
+     # We have to assume all lists in this post are meant to be code blocks
+     # to make it somewhat readable.
+     x.gsub!(/( )*(\s)*/, "")
+     x.gsub!(/( )*<\/ol>/, "")
+     x.gsub!('- ', '')
+     x.gsub!('
', '')
+   else
+     # No code block (probably...) so clean up more aggressively.
+     x.gsub!("\n", " ")
+     x.gsub!('', "\n\n")
+     # gsub! rather than gsub: the result of a plain gsub was discarded, so
+     # this replacement never took effect.
+     x.gsub!('
', ' ')
+     x.gsub!("
", "\n")
+     x.gsub!('', '')
+     x.gsub!('', '')
+     x.gsub!(/]*)>/, '')
+     x.gsub!('', '')
+   end
+
+   # gsub! returns nil when nothing matched, so its result is not chained.
+   x.gsub!(TOO_MANY_LINE_BREAKS, "\n\n")
+
+   CGI.unescapeHTML(x)
+ end
+
+
def import_post_id(row)
# Try to make up a unique id based on the data Zoho gives us.
# The posted_time seems to be the same for all posts in a topic, so we can't use that.