diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml index 5f9260e2e..47f225f6e 100644 --- a/config/locales/server.en.yml +++ b/config/locales/server.en.yml @@ -590,8 +590,8 @@ en: email_time_window_mins: "How many minutes we wait before sending a user mail, to give them a chance to see it first" flush_timings_secs: "How frequently we flush timing data to the server, in seconds" max_word_length: "The maximum allowed word length, in characters, in a topic title" - title_min_entropy: "The minimum allowed entropy (unique characters) required for a topic title" - body_min_entropy: "The minimum allowed entropy (unique characters) required for a post body" + title_min_entropy: "The minimum allowed entropy (unique characters, non-English characters count for more) required for a topic title" + body_min_entropy: "The minimum allowed entropy (unique characters, non-English characters count for more) required for a post body" title_fancy_entities: "Convert common ASCII characters to fancy HTML entities in topic titles, ala SmartyPants http://daringfireball.net/projects/smartypants/" diff --git a/lib/text_sentinel.rb b/lib/text_sentinel.rb index 230791e62..cb0d36393 100644 --- a/lib/text_sentinel.rb +++ b/lib/text_sentinel.rb @@ -21,8 +21,10 @@ class TextSentinel end # Entropy is a number of how many unique characters the string needs. + # Non-ASCII characters are weighted more heavily since they contain more "information" def entropy - @entropy ||= @text.to_s.strip.split('').uniq.size + chars = @text.to_s.strip.split('') + @entropy ||= chars.pack('M*'*chars.size).gsub("\n",'').split('=').uniq.size end def valid? 
diff --git a/spec/components/text_sentinel_spec.rb b/spec/components/text_sentinel_spec.rb index cf9e211ef..669dc837f 100644 --- a/spec/components/text_sentinel_spec.rb +++ b/spec/components/text_sentinel_spec.rb @@ -32,7 +32,15 @@ describe TextSentinel do end it "Works on foreign characters" do - TextSentinel.new("去年十社會警告").entropy.should == 7 + TextSentinel.new("去年十社會警告").entropy.should == 19 + end + + it "generates enough entropy for short foreign strings" do + TextSentinel.new("又一个测").entropy.should == 11 + end + + it "handles repeated foreign characters" do + TextSentinel.new("又一个测试话题" * 3).entropy.should == 18 end end