From bb77d2c38b27537f28f9ac0afa2585df7f345e7f Mon Sep 17 00:00:00 2001 From: Michael Brown Date: Fri, 7 Jun 2013 14:47:07 -0400 Subject: [PATCH 1/2] More entropy for foreign titles * Treat strings with non-ASCII characters as having more entropy --- lib/text_sentinel.rb | 4 +++- spec/components/text_sentinel_spec.rb | 10 +++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/lib/text_sentinel.rb b/lib/text_sentinel.rb index 230791e62..cb0d36393 100644 --- a/lib/text_sentinel.rb +++ b/lib/text_sentinel.rb @@ -21,8 +21,10 @@ class TextSentinel end # Entropy is a number of how many unique characters the string needs. + # Non-ASCII characters are weighted heavier since they contain more "information" def entropy - @entropy ||= @text.to_s.strip.split('').uniq.size + chars = @text.to_s.strip.split('') + @entropy ||= chars.pack('M*'*chars.size).gsub("\n",'').split('=').uniq.size end def valid? diff --git a/spec/components/text_sentinel_spec.rb b/spec/components/text_sentinel_spec.rb index cf9e211ef..669dc837f 100644 --- a/spec/components/text_sentinel_spec.rb +++ b/spec/components/text_sentinel_spec.rb @@ -32,7 +32,15 @@ describe TextSentinel do end it "Works on foreign characters" do - TextSentinel.new("去年十社會警告").entropy.should == 7 + TextSentinel.new("去年十社會警告").entropy.should == 19 + end + + it "generates enough entropy for short foreign strings" do + TextSentinel.new("又一个测").entropy.should == 11 + end + + it "handles repeated foreign characters" do + TextSentinel.new("又一个测试话题" * 3).entropy.should == 18 end end From f3fb10de03482daee927a8db76b0eb2480a2f47e Mon Sep 17 00:00:00 2001 From: Michael Brown Date: Fri, 7 Jun 2013 15:07:43 -0400 Subject: [PATCH 2/2] settings: Update english description of entropy --- config/locales/server.en.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/locales/server.en.yml b/config/locales/server.en.yml index fbdd34730..d84d83729 100644 --- a/config/locales/server.en.yml +++ b/config/locales/server.en.yml @@ -589,8 +589,8 @@ en: email_time_window_mins: "How many minutes we wait before sending a user mail, to give them a chance to see it first" flush_timings_secs: "How frequently we flush timing data to the server, in seconds" max_word_length: "The maximum allowed word length, in characters, in a topic title" - title_min_entropy: "The minimum allowed entropy (unique characters) required for a topic title" - body_min_entropy: "The minimum allowed entropy (unique characters) required for a post body" + title_min_entropy: "The minimum allowed entropy (unique characters, non-english count for more) required for a topic title" + body_min_entropy: "The minimum allowed entropy (unique characters, non-english count for more) required for a post body" title_fancy_entities: "Convert common ASCII characters to fancy HTML entities in topic titles, ala SmartyPants http://daringfireball.net/projects/smartypants/"