Introduction of TextSentinel to enforce title and body quality.

This commit is contained in:
Robin Ward 2013-02-06 20:09:31 -05:00
parent 94e58d733e
commit 40da901e5d
12 changed files with 220 additions and 32 deletions

View file

@ -38,6 +38,7 @@ class Post < ActiveRecord::Base
validates_presence_of :raw, :user_id, :topic_id validates_presence_of :raw, :user_id, :topic_id
validates :raw, length: {in: SiteSetting.min_post_length..SiteSetting.max_post_length} validates :raw, length: {in: SiteSetting.min_post_length..SiteSetting.max_post_length}
validate :raw_quality
validate :max_mention_validator validate :max_mention_validator
validate :max_images_validator validate :max_images_validator
validate :max_links_validator validate :max_links_validator
@ -68,6 +69,18 @@ class Post < ActiveRecord::Base
self.raw.strip! if self.raw.present? self.raw.strip! if self.raw.present?
end end
def raw_quality
sentinel = TextSentinel.new(self.raw, min_entropy: SiteSetting.body_min_entropy)
if sentinel.valid?
# It's possible the sentinel has cleaned up the title a bit
self.raw = sentinel.text
else
errors.add(:raw, I18n.t(:is_invalid)) unless sentinel.valid?
end
end
# Stop us from posting the same thing too quickly # Stop us from posting the same thing too quickly
def unique_post_validator def unique_post_validator
return if SiteSetting.unique_posts_mins == 0 return if SiteSetting.unique_posts_mins == 0

View file

@ -126,6 +126,11 @@ class SiteSetting < ActiveRecord::Base
setting(:basic_requires_read_posts, 100) setting(:basic_requires_read_posts, 100)
setting(:basic_requires_time_spent_mins, 30) setting(:basic_requires_time_spent_mins, 30)
# Entropy checks
setting(:title_min_entropy, 10)
setting(:body_min_entropy, 7)
setting(:max_word_length, 30)
def self.call_mothership? def self.call_mothership?
self.enforce_global_nicknames? and self.discourse_org_access_key.present? self.enforce_global_nicknames? and self.discourse_org_access_key.present?

View file

@ -2,6 +2,7 @@ require_dependency 'slug'
require_dependency 'avatar_lookup' require_dependency 'avatar_lookup'
require_dependency 'topic_view' require_dependency 'topic_view'
require_dependency 'rate_limiter' require_dependency 'rate_limiter'
require_dependency 'text_sentinel'
class Topic < ActiveRecord::Base class Topic < ActiveRecord::Base
include RateLimiter::OnCreateRecord include RateLimiter::OnCreateRecord
@ -18,13 +19,14 @@ class Topic < ActiveRecord::Base
rate_limit :limit_topics_per_day rate_limit :limit_topics_per_day
rate_limit :limit_private_messages_per_day rate_limit :limit_private_messages_per_day
validate :title_quality
validates_presence_of :title validates_presence_of :title
validates :title, length: {in: SiteSetting.min_topic_title_length..SiteSetting.max_topic_title_length} validates :title, length: {in: SiteSetting.min_topic_title_length..SiteSetting.max_topic_title_length}
serialize :meta_data, ActiveRecord::Coders::Hstore serialize :meta_data, ActiveRecord::Coders::Hstore
validate :unique_title validate :unique_title
validate :nuclear_option
belongs_to :category belongs_to :category
has_many :posts has_many :posts
@ -113,18 +115,20 @@ class Topic < ActiveRecord::Base
errors.add(:title, I18n.t(:has_already_been_used)) if finder.exists? errors.add(:title, I18n.t(:has_already_been_used)) if finder.exists?
end end
# This is bad, but people are screwing us on try.discourse.org - soon we'll replace with
# a much more sane validation of odd characters to allow for other languages and such.
def nuclear_option
# Let presence validation catch it if it's blank def title_quality
return if title.blank? # We don't care about quality on private messages
return if private_message?
title.each_char do |c| sentinel = TextSentinel.new(title,
unless (20..126).include?(c.ord) min_entropy: SiteSetting.title_min_entropy,
errors.add(:title, I18n.t(:invalid_characters)) max_word_length: SiteSetting.max_word_length,
return remove_interior_spaces: true)
end if sentinel.valid?
# It's possible the sentinel has cleaned up the title a bit
self.title = sentinel.text
else
errors.add(:title, I18n.t(:is_invalid)) unless sentinel.valid?
end end
end end

View file

@ -9,6 +9,7 @@ en:
just_posted_that: "is too similar to what you recently posted" just_posted_that: "is too similar to what you recently posted"
has_already_been_used: "has already been used" has_already_been_used: "has already been used"
invalid_characters: "contains invalid characters" invalid_characters: "contains invalid characters"
is_invalid: "is invalid; try to be a little more descriptive"
activerecord: activerecord:
attributes: attributes:
@ -302,6 +303,10 @@ en:
email_time_window_mins: "How many minutes we wait before sending a user mail, to give them a chance to see it first." email_time_window_mins: "How many minutes we wait before sending a user mail, to give them a chance to see it first."
flush_timings_secs: "How frequently we flush timing data to the server, in seconds." flush_timings_secs: "How frequently we flush timing data to the server, in seconds."
max_word_length: "The maximum word length in a topic title"
title_min_entropy: "The minimum entropy for a topic title"
body_min_entropy: "The minimum entropy for post body"
# This section is exported to the javascript for i18n in the admin section # This section is exported to the javascript for i18n in the admin section
admin_js: admin_js:
type_to_filter: "Type to Filter..." type_to_filter: "Type to Filter..."

56
lib/text_sentinel.rb Normal file
View file

@ -0,0 +1,56 @@
require 'iconv'
#
# Given a string, tell us whether or not is acceptable. Also, remove stuff we don't like
# such as leading / trailing space.
#
class TextSentinel
attr_accessor :text
def self.non_symbols_regexp
/[\ -\/\[-\`\:-\@\{-\~]/m
end
def initialize(text, opts=nil)
if text.present?
@text = Iconv.new('UTF-8//IGNORE', 'UTF-8').iconv(text.dup)
end
@opts = opts || {}
if @text.present?
@text.strip!
@text.gsub!(/ +/m, ' ') if @opts[:remove_interior_spaces]
end
end
# Entropy is a number of how many unique characters the string needs.
def entropy
return 0 if @text.blank?
@entropy ||= @text.each_char.to_a.uniq.size
end
def valid?
# Blank strings are not valid
return false if @text.blank?
# Entropy check if required
return false if @opts[:min_entropy].present? and (entropy < @opts[:min_entropy])
# We don't have a comprehensive list of symbols, but this will eliminate some noise
non_symbols = @text.gsub(TextSentinel.non_symbols_regexp, '').size
return false if non_symbols == 0
# Don't allow super long strings without spaces
return false if @opts[:max_word_length] and @text =~ /\w{#{@opts[:max_word_length]},}(\s|$)/
# We don't allow all upper case content
return false if @text == @text.upcase
true
end
end

View file

@ -11,7 +11,7 @@ describe PostCreator do
context 'new topic' do context 'new topic' do
let(:category) { Fabricate(:category, user: user) } let(:category) { Fabricate(:category, user: user) }
let(:basic_topic_params) { {title: 'hello world', raw: 'my name is fred', archetype_id: 1} } let(:basic_topic_params) { {title: 'hello world topic', raw: 'my name is fred', archetype_id: 1} }
let(:image_sizes) { {'http://an.image.host/image.jpg' => {'width' => 111, 'height' => 222}} } let(:image_sizes) { {'http://an.image.host/image.jpg' => {'width' => 111, 'height' => 222}} }
let(:creator) { PostCreator.new(user, basic_topic_params) } let(:creator) { PostCreator.new(user, basic_topic_params) }
@ -83,7 +83,7 @@ describe PostCreator do
let(:target_user1) { Fabricate(:coding_horror) } let(:target_user1) { Fabricate(:coding_horror) }
let(:target_user2) { Fabricate(:moderator) } let(:target_user2) { Fabricate(:moderator) }
let(:post) do let(:post) do
PostCreator.create(user, title: 'hi there', PostCreator.create(user, title: 'hi there welcome to my topic',
raw: 'this is my awesome message', raw: 'this is my awesome message',
archetype: Archetype.private_message, archetype: Archetype.private_message,
target_usernames: [target_user1.username, target_user2.username].join(',')) target_usernames: [target_user1.username, target_user2.username].join(','))

View file

@ -14,7 +14,7 @@ describe Search do
context 'post indexing observer' do context 'post indexing observer' do
before do before do
@category = Fabricate(:category, name: 'america') @category = Fabricate(:category, name: 'america')
@topic = Fabricate(:topic, title: 'sam test', category: @category) @topic = Fabricate(:topic, title: 'sam test topic', category: @category)
@post = Fabricate(:post, topic: @topic, raw: 'this <b>fun test</b> <img src="bla" title="my image">') @post = Fabricate(:post, topic: @topic, raw: 'this <b>fun test</b> <img src="bla" title="my image">')
@indexed = Topic.exec_sql("select search_data from posts_search where id = #{@post.id}").first["search_data"] @indexed = Topic.exec_sql("select search_data from posts_search where id = #{@post.id}").first["search_data"]
end end
@ -29,7 +29,7 @@ describe Search do
end end
it "should pick up on title updates" do it "should pick up on title updates" do
@topic.title = "harpi" @topic.title = "harpi is the new title"
@topic.save! @topic.save!
@indexed = Topic.exec_sql("select search_data from posts_search where id = #{@post.id}").first["search_data"] @indexed = Topic.exec_sql("select search_data from posts_search where id = #{@post.id}").first["search_data"]

View file

@ -0,0 +1,96 @@
# encoding: utf-8
require 'spec_helper'
require 'text_sentinel'
require 'iconv'
describe TextSentinel do
context "entropy" do
it "returns 0 for an empty string" do
TextSentinel.new("").entropy.should == 0
end
it "returns 0 for a nil string" do
TextSentinel.new(nil).entropy.should == 0
end
it "returns 1 for a string with many leading spaces" do
TextSentinel.new((" " * 10) + "x").entropy.should == 1
end
it "returns 1 for one char, even repeated" do
TextSentinel.new("a" * 10).entropy.should == 1
end
it "returns an accurate count of many chars" do
TextSentinel.new("evil trout is evil").entropy.should == 10
end
end
context "cleaning up" do
it "strips leading or trailing whitespace" do
TextSentinel.new(" \t test \t ").text.should == "test"
end
it "allows utf-8 chars" do
TextSentinel.new("йȝîûηыეமிᚉ⠛").text.should == "йȝîûηыეமிᚉ⠛"
end
context "interior spaces" do
let(:spacey_string) { "hello there's weird spaces here." }
it "ignores intra spaces by default" do
TextSentinel.new(spacey_string).text.should == spacey_string
end
it "fixes intra spaces when enabled" do
TextSentinel.new(spacey_string, remove_interior_spaces: true).text.should == "hello there's weird spaces here."
end
end
end
context "validity" do
let(:valid_string) { "This is a cool topic about Discourse" }
it "allows a valid string" do
TextSentinel.new(valid_string).should be_valid
end
it "doesn't allow all caps topics" do
TextSentinel.new(valid_string.upcase).should_not be_valid
end
it "enforces the minimum entropy" do
TextSentinel.new(valid_string, min_entropy: 16).should be_valid
end
it "enforces the minimum entropy" do
TextSentinel.new(valid_string, min_entropy: 17).should_not be_valid
end
it "doesn't allow a long alphanumeric string with no spaces" do
TextSentinel.new("jfewjfoejwfojeojfoejofjeo38493824jfkjewfjeoifijeoijfoejofjeojfoewjfo834988394032jfiejoijofijeojfeojfojeofjewojfojeofjeowjfojeofjeojfoe3898439849032jfeijfwoijfoiewj",
max_word_length: 30).should_not be_valid
end
it "doesn't except junk symbols as a string" do
TextSentinel.new("[[[").should_not be_valid
TextSentinel.new("<<<").should_not be_valid
TextSentinel.new("{{$!").should_not be_valid
end
end
end

View file

@ -11,11 +11,11 @@ describe TopicQuery do
let(:admin) { Fabricate(:moderator) } let(:admin) { Fabricate(:moderator) }
context 'a bunch of topics' do context 'a bunch of topics' do
let!(:regular_topic) { Fabricate(:topic, title: 'regular', user: creator, bumped_at: 15.minutes.ago) } let!(:regular_topic) { Fabricate(:topic, title: 'this is a regular topic', user: creator, bumped_at: 15.minutes.ago) }
let!(:pinned_topic) { Fabricate(:topic, title: 'pinned', user: creator, pinned: true, bumped_at: 10.minutes.ago) } let!(:pinned_topic) { Fabricate(:topic, title: 'this is a pinned topic', user: creator, pinned: true, bumped_at: 10.minutes.ago) }
let!(:archived_topic) { Fabricate(:topic, title: 'archived', user: creator, archived: true, bumped_at: 6.minutes.ago) } let!(:archived_topic) { Fabricate(:topic, title: 'this is an archived topic', user: creator, archived: true, bumped_at: 6.minutes.ago) }
let!(:invisible_topic) { Fabricate(:topic, title: 'invisible', user: creator, visible: false, bumped_at: 5.minutes.ago) } let!(:invisible_topic) { Fabricate(:topic, title: 'this is an invisible topic', user: creator, visible: false, bumped_at: 5.minutes.ago) }
let!(:closed_topic) { Fabricate(:topic, title: 'closed', user: creator, closed: true, bumped_at: 1.minute.ago) } let!(:closed_topic) { Fabricate(:topic, title: 'this is a closed topic', user: creator, closed: true, bumped_at: 1.minute.ago) }
context 'list_popular' do context 'list_popular' do
let(:topics) { topic_query.list_popular.topics } let(:topics) { topic_query.list_popular.topics }

View file

@ -334,9 +334,9 @@ describe TopicsController do
end end
it 'allows a change of title' do it 'allows a change of title' do
xhr :put, :update, topic_id: @topic.id, slug: @topic.title, title: 'new title' xhr :put, :update, topic_id: @topic.id, slug: @topic.title, title: 'this is a new title for the topic'
@topic.reload @topic.reload
@topic.title.should == 'new title' @topic.title.should == 'this is a new title for the topic'
end end
it 'triggers a change of category' do it 'triggers a change of category' do

View file

@ -30,7 +30,7 @@ describe PostAlertObserver do
context 'when editing a post' do context 'when editing a post' do
it 'notifies a user of the revision' do it 'notifies a user of the revision' do
lambda { lambda {
post.revise(evil_trout, "world") post.revise(evil_trout, "world is the new body of the message")
}.should change(post.user.notifications, :count).by(1) }.should change(post.user.notifications, :count).by(1)
end end
end end

View file

@ -5,9 +5,6 @@ require 'spec_helper'
describe Topic do describe Topic do
it { should validate_presence_of :title } it { should validate_presence_of :title }
it { should_not allow_value("x" * (SiteSetting.max_topic_title_length + 1)).for(:title) }
it { should_not allow_value("x").for(:title) }
it { should_not allow_value((" " * SiteSetting.min_topic_title_length) + "x").for(:title) }
it { should belong_to :category } it { should belong_to :category }
it { should belong_to :user } it { should belong_to :user }
@ -26,14 +23,26 @@ describe Topic do
it { should rate_limit } it { should rate_limit }
context 'topic title content' do context '.title_quality' do
it "strips a title when identifying length" do
Fabricate.build(:topic, title: (" " * SiteSetting.min_topic_title_length) + "x").should_not be_valid
end
it "doesn't allow a long title" do
Fabricate.build(:topic, title: "x" * (SiteSetting.max_topic_title_length + 1)).should_not be_valid
end
it "doesn't allow a short title" do
Fabricate.build(:topic, title: "x" * (SiteSetting.min_topic_title_length + 1)).should_not be_valid
end
it "allows a regular title with a few ascii characters" do it "allows a regular title with a few ascii characters" do
Fabricate.build(:topic, title: "hello this is my cool topic! welcome: all;").should be_valid Fabricate.build(:topic, title: "hello this is my cool topic! welcome: all;").should be_valid
end end
it "doesn't allow non standard ascii" do it "allows non ascii" do
Fabricate.build(:topic, title: "Iñtërnâtiônàlizætiøn").should_not be_valid Fabricate.build(:topic, title: "Iñtërnâtiônàlizætiøn").should be_valid
end end
end end
@ -830,7 +839,7 @@ describe Topic do
context 'changing title' do context 'changing title' do
before do before do
topic.title = "new title" topic.title = "new title for the topic"
topic.save topic.save
end end