discourse/lib/text_cleaner.rb

#
# Clean up a text
#

# We use ActiveSupport's mb_chars here to properly support non-ASCII downcasing
require 'active_support/core_ext/string/multibyte'

# Stateless helpers that tidy up user-supplied text such as topic titles.
class TextCleaner

  # Unicode space-like characters that get replaced by a plain ASCII space:
  # U+00A0, U+1680, U+180E, U+2000..U+200A, U+2028, U+2029, U+202F, U+205F
  # and U+3000. U+200B (zero width space) is intentionally not part of the set.
  #
  # Written as a literal on purpose: Regexp.new(source, "u") raises
  # ArgumentError ("unknown regexp option") on Ruby 3.2+, and on older Rubies
  # the "u" string merely switched on case-insensitive matching.
  WHITESPACES_REGEXP = /[\u00A0\u1680\u180E\u2000-\u200A\u2028\u2029\u202F\u205F\u3000]/.freeze

  # Builds the option hash used when cleaning a topic title.
  #
  # Every "prettify" rule follows SiteSetting.title_prettify; removing the
  # space in front of a trailing "!" / "?" is additionally limited to sites
  # whose default locale is "en". Collapsing interior spaces and stripping
  # are always enabled.
  #
  # cf. http://meta.discourse.org/t/should-we-have-auto-replace-rules-in-titles/5687
  #
  # @return [Hash] options understood by TextCleaner.clean
  def self.title_options
    prettify = SiteSetting.title_prettify

    {
      deduplicate_exclamation_marks: prettify,
      deduplicate_question_marks: prettify,
      replace_all_upper_case: prettify,
      capitalize_first_letter: prettify,
      remove_all_periods_from_the_end: prettify,
      remove_extraneous_space: prettify && SiteSetting.default_locale == "en",
      fixes_interior_spaces: true,
      strip_whitespaces: true
    }
  end

  # Cleans a topic title with the site-wide title options.
  #
  # @param title [String] raw title
  # @return [String] cleaned title
  def self.clean_title(title)
    clean(title, title_options)
  end

  # Applies the clean-up rules enabled in +opts+ and returns the result.
  #
  # The argument itself is never modified, so frozen strings are accepted.
  # Unicode whitespace normalization always runs, whatever +opts+ contains.
  #
  # @param text [String] text to clean
  # @param opts [Hash] rule switches, see TextCleaner.title_options for the keys
  # @return [String] a new, cleaned string
  def self.clean(text, opts = {})
    # Work on a private copy: the bang methods below would otherwise raise on
    # frozen input and leave the caller's string half-modified.
    text = text.dup

    # Replace !!!!! with a single !
    text.gsub!(/!+/, '!') if opts[:deduplicate_exclamation_marks]
    # Replace ????? with a single ?
    text.gsub!(/\?+/, '?') if opts[:deduplicate_question_marks]

    # Replace all-caps text with regular case letters.
    # Note: the guard only fires when the text has at least one ASCII capital.
    if opts[:replace_all_upper_case] && (text =~ /[A-Z]+/) && (text == text.upcase)
      text = text.mb_chars.downcase.to_s
    end

    # Capitalize the first word, but only when that word is entirely lowercase
    # (so "iPhone ..." is left alone). The rest of the text keeps its case, and
    # empty / blank text has no first word and is skipped.
    if opts[:capitalize_first_letter]
      first_word = text[/\S+/]
      if first_word && first_word == first_word.mb_chars.downcase.to_s
        text = text.sub(/\S+/) { |word| word.mb_chars.capitalize.to_s }
      end
    end

    # Remove unnecessary periods at the end
    text.sub!(/([^.])\.+(\s*)\z/, '\1\2') if opts[:remove_all_periods_from_the_end]
    # Remove extraneous space before the end punctuation
    text.sub!(/\s+([!?]\s*)\z/, '\1') if opts[:remove_extraneous_space]

    # Normalize whitespaces first, so that the spaces it produces are also
    # seen by the interior-space collapse right below.
    text = normalize_whitespaces(text)
    # Fixes interior spaces
    text.gsub!(/ +/, ' ') if opts[:fixes_interior_spaces]
    # Strip whitespaces
    text.strip! if opts[:strip_whitespaces]

    text
  end

  # Replaces every character matched by WHITESPACES_REGEXP with an ASCII space.
  #
  # @param text [String]
  # @return [String] a new string; +text+ is left untouched
  def self.normalize_whitespaces(text)
    text.gsub(WHITESPACES_REGEXP, ' ')
  end

end
auto replace rules in titles 2013-04-10 05:00:50 -04:00			`#`
			`# Clean up a text`
			`#`
FIX: Properly downcase unicode chars Fix to https://meta.discourse.org/t/title-prettify-does-not-correctly-lowercase-non-english-characters-when-removing-all-caps/16645 This adds a dependency on Active Support Multibyte to downcase on other languages. 2016-03-07 15:07:36 -05:00
			`# Whe use ActiveSupport mb_chars from here to properly support non ascii downcase`
			`require 'active_support/core_ext/string/multibyte'`

auto replace rules in titles 2013-04-10 05:00:50 -04:00			`class TextCleaner`

			`def self.title_options`
			`# cf. http://meta.discourse.org/t/should-we-have-auto-replace-rules-in-titles/5687`
			`{`
			`deduplicate_exclamation_marks: SiteSetting.title_prettify,`
			`deduplicate_question_marks: SiteSetting.title_prettify,`
			`replace_all_upper_case: SiteSetting.title_prettify,`
			`capitalize_first_letter: SiteSetting.title_prettify,`
Text Cleaner now removes all periods from the end of the title 2013-04-17 18:19:42 -04:00			`remove_all_periods_from_the_end: SiteSetting.title_prettify,`
auto replace rules in titles 2013-04-10 05:00:50 -04:00			`remove_extraneous_space: SiteSetting.title_prettify && SiteSetting.default_locale == "en",`
			`fixes_interior_spaces: true,`
			`strip_whitespaces: true`
			`}`
			`end`

			`def self.clean_title(title)`
			`TextCleaner.clean(title, TextCleaner.title_options)`
			`end`

			`def self.clean(text, opts = {})`
			`# Replace !!!!! with a single !`
			`text.gsub!(/!+/, '!') if opts[:deduplicate_exclamation_marks]`
			`# Replace ????? with a single ?`
			`text.gsub!(/\?+/, '?') if opts[:deduplicate_question_marks]`
			`# Replace all-caps text with regular case letters`
Handle Capitalize too Handles capitalize too, using the same lib 2016-03-07 20:09:30 -05:00			`text = text.mb_chars.downcase.to_s if opts[:replace_all_upper_case] && (text =~ /[A-Z]+/) && (text == text.upcase)`
Thread title fixer should ignore special cases like iLetter 2013-05-23 15:31:08 -04:00			`# Capitalize first letter, but only when entire first word is lowercase`
Handle Capitalize too Handles capitalize too, using the same lib 2016-03-07 20:09:30 -05:00			`text = text.mb_chars.capitalize.to_s if opts[:capitalize_first_letter] && text.split(' ').first == text.split(' ').first.mb_chars.downcase`
Text Cleaner now removes all periods from the end of the title 2013-04-17 18:19:42 -04:00			`# Remove unnecessary periods at the end`
			`text.sub!(/([^.])\.+(\s*)\z/, '\1\2') if opts[:remove_all_periods_from_the_end]`
auto replace rules in titles 2013-04-10 05:00:50 -04:00			`# Remove extraneous space before the end punctuation`
			`text.sub!(/\s+([!?]\s*)\z/, '\1') if opts[:remove_extraneous_space]`
			`# Fixes interior spaces`
			`text.gsub!(/ +/, ' ') if opts[:fixes_interior_spaces]`
FEATURE: normalize whitespaces in topic title/post content 2014-08-11 18:01:58 -04:00			`# Normalize whitespaces`
			`text = normalize_whitespaces(text)`
auto replace rules in titles 2013-04-10 05:00:50 -04:00			`# Strip whitespaces`
			`text.strip! if opts[:strip_whitespaces]`

			`text`
			`end`

FIX: stop stripping zero-width-whitespace This char is used for formatting khmer words 2015-03-26 22:01:31 -04:00			`@@whitespaces_regexp = Regexp.new("(\u00A0\|\u1680\|\u180E\|[\u2000-\u200A]\|\u2028\|\u2029\|\u202F\|\u205F\|\u3000)", "u").freeze`
FEATURE: normalize whitespaces in topic title/post content 2014-08-11 18:01:58 -04:00
			`def self.normalize_whitespaces(text)`
			`text.gsub(@@whitespaces_regexp, ' ')`
			`end`

auto replace rules in titles 2013-04-10 05:00:50 -04:00			`end`