diff --git a/lib/text_cleaner.rb b/lib/text_cleaner.rb index 6d621ec9f..bf10cdbc5 100644 --- a/lib/text_cleaner.rb +++ b/lib/text_cleaner.rb @@ -1,6 +1,10 @@ # # Clean up a text # + +# We use ActiveSupport's mb_chars here to properly downcase non-ASCII characters +require 'active_support/core_ext/string/multibyte' + class TextCleaner def self.title_options @@ -27,9 +31,12 @@ class TextCleaner # Replace ????? with a single ? text.gsub!(/\?+/, '?') if opts[:deduplicate_question_marks] # Replace all-caps text with regular case letters - text.tr!('A-Z', 'a-z') if opts[:replace_all_upper_case] && (text =~ /[A-Z]+/) && (text == text.upcase) + text = text.mb_chars.downcase.to_s if opts[:replace_all_upper_case] && (text =~ /[A-Z]+/) && (text == text.upcase) # Capitalize first letter, but only when entire first word is lowercase - text.sub!(/\A([a-z]*)\b/) { |first| first.capitalize } if opts[:capitalize_first_letter] + first, rest = text.split(' ', 2) + if first && opts[:capitalize_first_letter] && first == first.mb_chars.downcase + text = "#{first.mb_chars.capitalize}#{rest ? ' ' + rest : ''}" + end # Remove unnecessary periods at the end text.sub!(/([^.])\.+(\s*)\z/, '\1\2') if opts[:remove_all_periods_from_the_end] # Remove extraneous space before the end punctuation diff --git a/spec/components/text_cleaner_spec.rb b/spec/components/text_cleaner_spec.rb index cf203c709..a41a1e7c1 100644 --- a/spec/components/text_cleaner_spec.rb +++ b/spec/components/text_cleaner_spec.rb @@ -187,6 +187,14 @@ describe TextCleaner do expect(TextCleaner.clean_title("Hello there ?")).to eq("Hello there?") end + it "replaces all upper case unicode text with regular unicode case letters" do + expect(TextCleaner.clean_title("INVESTIGAÇÃO POLÍTICA NA CÂMARA")).to eq("Investigação política na câmara") + end + + it "capitalizes first unicode letter" do + expect(TextCleaner.clean_title("épico encontro")).to eq("Épico encontro") + end + end end