# Patch summary (review annotations; everything from "diff --git" onward is the unmodified patch text).
#
# lib/text_sentinel.rb -- TextSentinel#entropy:
#   Removes the plain "count of unique characters" computation and instead
#   packs every character of the stripped text with Array#pack's 'M'
#   (quoted-printable) directive, deletes the newlines, splits the result on
#   '=' and memoizes the number of unique tokens in @entropy.
#
# spec/components/text_sentinel_spec.rb:
#   Changes the expectation for "去年十社會警告" from 7 to 19 and adds two
#   examples: a short foreign string (expects 11) and a foreign string
#   repeated three times (expects 18).
#
# NOTE(review): `chars = ...` is assigned outside the `||=`, so the strip/split
#   still runs on every call even when @entropy is already memoized.
# NOTE(review): for multi-byte input the split('=') step appears to also yield
#   an empty-string token that `uniq` counts ("又一个测" has 10 distinct UTF-8
#   bytes, the spec expects 11) -- confirm this off-by-one is intended.
# NOTE(review): the patch's line breaks appear to have been collapsed onto a
#   single line here; presumably they must be restored before it can be applied.
diff --git a/lib/text_sentinel.rb b/lib/text_sentinel.rb index 230791e62..cb0d36393 100644 --- a/lib/text_sentinel.rb +++ b/lib/text_sentinel.rb @@ -21,8 +21,10 @@ class TextSentinel end # Entropy is a number of how many unique characters the string needs. + # Non-ASCII characters are weighted heavier since they contain more "information" def entropy - @entropy ||= @text.to_s.strip.split('').uniq.size + chars = @text.to_s.strip.split('') + @entropy ||= chars.pack('M*'*chars.size).gsub("\n",'').split('=').uniq.size end def valid? diff --git a/spec/components/text_sentinel_spec.rb b/spec/components/text_sentinel_spec.rb index cf9e211ef..669dc837f 100644 --- a/spec/components/text_sentinel_spec.rb +++ b/spec/components/text_sentinel_spec.rb @@ -32,7 +32,15 @@ describe TextSentinel do end it "Works on foreign characters" do - TextSentinel.new("去年十社會警告").entropy.should == 7 + TextSentinel.new("去年十社會警告").entropy.should == 19 + end + + it "generates enough entropy for short foreign strings" do + TextSentinel.new("又一个测").entropy.should == 11 + end + + it "handles repeated foreign characters" do + TextSentinel.new("又一个测试话题" * 3).entropy.should == 18 end end