BUGFIX: Chinese search was broken

BUGFIX: User locale was used index data
BUGFIX: missing Norwegian fulltext config
FEATURE: store the text used to index stuff in fulltext (for diagnostics / in page search)
FEATURE: re-index posts when locale changes (in bg job)
FEATURE: allow reindexing by trucating post_search_data

Note: I removed japanese specific config cause it requires custom pg config,
  happy to add it once our base docker config ships with it
This commit is contained in:
Sam 2014-06-24 17:10:56 +10:00
parent 51ff644409
commit 3c84876660
8 changed files with 134 additions and 16 deletions

View file

@ -223,6 +223,8 @@ gem 'gctools', require: false, platform: :mri_21
gem 'stackprof', require: false, platform: :mri_21
gem 'memory_profiler', require: false, platform: :mri_21
gem 'rmmseg-cpp', require: false
# This silly path comment just makes it easier for me to do dev
# will be removed in a few weeks
gem 'logster'#, path: '../logster'

View file

@ -283,6 +283,7 @@ GEM
rest-client (1.6.7)
mime-types (>= 1.16)
rinku (1.7.3)
rmmseg-cpp (0.2.9)
rspec (2.14.1)
rspec-core (~> 2.14.0)
rspec-expectations (~> 2.14.0)
@ -462,6 +463,7 @@ DEPENDENCIES
redis
rest-client
rinku
rmmseg-cpp
rspec-given
rspec-rails
ruby-readability

View file

@ -31,6 +31,8 @@ module Jobs
unless UserAvatar.where("last_gravatar_download_attempt IS NULL").limit(1).first
Post.rebake_old(250)
end
Search.rebuild_problem_posts
end
end

View file

@ -8,6 +8,8 @@ class SearchObserver < ActiveRecord::Observer
end
def self.update_index(table, id, search_data)
search_data = Search.prepare_data(search_data)
table_name = "#{table}_search_data"
foreign_key = "#{table}_id"
@ -16,9 +18,18 @@ class SearchObserver < ActiveRecord::Observer
# Would be nice to use AR here but not sure how to execut Postgres functions
# when inserting data like this.
rows = Post.exec_sql_row_count("UPDATE #{table_name} SET search_data = TO_TSVECTOR('#{stemmer}', ?) WHERE #{foreign_key} = ?", search_data, id)
rows = Post.exec_sql_row_count("UPDATE #{table_name}
SET
raw_data = :search_data,
locale = :locale,
search_data = TO_TSVECTOR('#{stemmer}', :search_data)
WHERE #{foreign_key} = :id",
search_data: search_data, id: id, locale: SiteSetting.default_locale)
if rows == 0
Post.exec_sql("INSERT INTO #{table_name} (#{foreign_key}, search_data) VALUES (?, TO_TSVECTOR('#{stemmer}', ?))", id, search_data)
Post.exec_sql("INSERT INTO #{table_name}
(#{foreign_key}, search_data, locale, raw_data)
VALUES (:id, TO_TSVECTOR('#{stemmer}', :search_data), :locale, :search_data)",
search_data: search_data, id: id, locale: SiteSetting.default_locale)
end
rescue
# don't allow concurrency to mess up saving a post
@ -39,7 +50,7 @@ class SearchObserver < ActiveRecord::Observer
update_index('category', category_id, name)
end
def after_save(obj)
def self.index(obj)
if obj.class == Post && obj.cooked_changed?
if obj.topic
category_name = obj.topic.category.name if obj.topic.category
@ -67,6 +78,10 @@ class SearchObserver < ActiveRecord::Observer
end
end
def after_save(object)
SearchObserver.index(object)
end
class HtmlScrubber < Nokogiri::XML::SAX::Document
attr_reader :scrubbed

View file

@ -0,0 +1,11 @@
class AddRawDataToSearch < ActiveRecord::Migration
def change
add_column :post_search_data, :raw_data, :text
add_column :user_search_data, :raw_data, :text
add_column :category_search_data, :raw_data, :text
add_column :post_search_data, :locale, :string
add_column :user_search_data, :locale, :text
add_column :category_search_data, :locale, :text
end
end

View file

@ -19,25 +19,67 @@ class Search
end
def self.long_locale
case I18n.locale # Currently-present in /conf/locales/* only, sorry :-( Add as needed
# if adding a language see:
# /usr/share/postgresql/9.3/tsearch_data for possible options
# Do not add languages that are missing without amending the
# base docker config
#
case SiteSetting.default_locale.to_sym
when :da then 'danish'
when :de then 'german'
when :en then 'english'
when :es then 'spanish'
when :fr then 'french'
when :it then 'italian'
when :ja then 'japanese'
when :nl then 'dutch'
when :nb_NO then 'norwegian'
when :pt then 'portuguese'
when :pt_BR then 'portuguese'
when :sv then 'swedish'
when :ru then 'russian'
else 'simple' # use the 'simple' stemmer for other languages
end
end
def self.rebuild_problem_posts(limit = 10000)
posts = Post.joins(:topic)
.where('posts.id NOT IN (
SELECT post_id from post_search_data
WHERE locale = ?
)', SiteSetting.default_locale).limit(10000)
posts.each do |post|
post.cooked += " "
SearchObserver.index(post)
end
nil
end
def self.prepare_data(search_data)
data = search_data.squish
# TODO rmmseg is designed for chinese, we need something else for Korean / Japanese
if ['zh_TW', 'zh_CN', 'ja', 'ko'].include?(SiteSetting.default_locale)
unless defined? RMMSeg
require 'rmmseg'
RMMSeg::Dictionary.load_dictionaries
end
algo = RMMSeg::Algorithm.new(search_data)
data = ""
while token = algo.next_token
data << token.text << " "
end
end
data.force_encoding("UTF-8")
data
end
def initialize(term, opts=nil)
if term.present?
@term = term.to_s
@term = Search.prepare_data(term.to_s)
@original_term = PG::Connection.escape_string(@term)
end

View file

@ -77,6 +77,10 @@ class SqlBuilder
16 => :value_to_boolean
}
def self.map_exec(klass, sql, args = {})
self.new(sql).map_exec(klass, args)
end
def map_exec(klass = OpenStruct, args = {})
results = exec(args)

View file

@ -0,0 +1,40 @@
require 'spec_helper'
describe SearchObserver do
def get_row(post_id)
SqlBuilder.map_exec(
OpenStruct,
"select * from post_search_data where post_id = :post_id",
post_id: post_id
).first
end
it 'correctly indexes chinese' do
SiteSetting.default_locale = 'zh_CN'
data = "你好世界"
data.split(" ").length.should == 1
SearchObserver.update_posts_index(99, "你好世界", "", nil)
row = get_row(99)
row.raw_data.split(' ').length.should == 2
end
it 'correctly indexes a post' do
data = "<a>This</a> is a test"
SearchObserver.update_posts_index(99, data, "", nil)
row = get_row(99)
row.raw_data.should == "This is a test"
row.locale.should == "en"
SearchObserver.update_posts_index(99, "tester", "", nil)
row = get_row(99)
row.raw_data.should == "tester"
end
end