mirror of
https://github.com/codeninjasllc/discourse.git
synced 2024-11-27 09:36:19 -05:00
BUGFIX: Chinese search was broken
BUGFIX: User locale was used to index data
BUGFIX: missing Norwegian fulltext config
FEATURE: store the text used to index stuff in fulltext (for diagnostics / in-page search)
FEATURE: re-index posts when the locale changes (in a background job)
FEATURE: allow reindexing by truncating post_search_data
Note: I removed the Japanese-specific config because it requires custom pg config; happy to add it once our base docker config ships with it
This commit is contained in:
parent
51ff644409
commit
3c84876660
8 changed files with 134 additions and 16 deletions
2
Gemfile
2
Gemfile
|
@ -223,6 +223,8 @@ gem 'gctools', require: false, platform: :mri_21
|
||||||
gem 'stackprof', require: false, platform: :mri_21
|
gem 'stackprof', require: false, platform: :mri_21
|
||||||
gem 'memory_profiler', require: false, platform: :mri_21
|
gem 'memory_profiler', require: false, platform: :mri_21
|
||||||
|
|
||||||
|
gem 'rmmseg-cpp', require: false
|
||||||
|
|
||||||
# This silly path comment just makes it easier for me to do dev
|
# This silly path comment just makes it easier for me to do dev
|
||||||
# will be removed in a few weeks
|
# will be removed in a few weeks
|
||||||
gem 'logster'#, path: '../logster'
|
gem 'logster'#, path: '../logster'
|
||||||
|
|
|
@ -283,6 +283,7 @@ GEM
|
||||||
rest-client (1.6.7)
|
rest-client (1.6.7)
|
||||||
mime-types (>= 1.16)
|
mime-types (>= 1.16)
|
||||||
rinku (1.7.3)
|
rinku (1.7.3)
|
||||||
|
rmmseg-cpp (0.2.9)
|
||||||
rspec (2.14.1)
|
rspec (2.14.1)
|
||||||
rspec-core (~> 2.14.0)
|
rspec-core (~> 2.14.0)
|
||||||
rspec-expectations (~> 2.14.0)
|
rspec-expectations (~> 2.14.0)
|
||||||
|
@ -462,6 +463,7 @@ DEPENDENCIES
|
||||||
redis
|
redis
|
||||||
rest-client
|
rest-client
|
||||||
rinku
|
rinku
|
||||||
|
rmmseg-cpp
|
||||||
rspec-given
|
rspec-given
|
||||||
rspec-rails
|
rspec-rails
|
||||||
ruby-readability
|
ruby-readability
|
||||||
|
|
|
@ -31,6 +31,8 @@ module Jobs
|
||||||
unless UserAvatar.where("last_gravatar_download_attempt IS NULL").limit(1).first
|
unless UserAvatar.where("last_gravatar_download_attempt IS NULL").limit(1).first
|
||||||
Post.rebake_old(250)
|
Post.rebake_old(250)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
Search.rebuild_problem_posts
|
||||||
end
|
end
|
||||||
|
|
||||||
end
|
end
|
||||||
|
|
|
@ -8,6 +8,8 @@ class SearchObserver < ActiveRecord::Observer
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.update_index(table, id, search_data)
|
def self.update_index(table, id, search_data)
|
||||||
|
search_data = Search.prepare_data(search_data)
|
||||||
|
|
||||||
table_name = "#{table}_search_data"
|
table_name = "#{table}_search_data"
|
||||||
foreign_key = "#{table}_id"
|
foreign_key = "#{table}_id"
|
||||||
|
|
||||||
|
@ -16,9 +18,18 @@ class SearchObserver < ActiveRecord::Observer
|
||||||
|
|
||||||
# Would be nice to use AR here but not sure how to execut Postgres functions
|
# Would be nice to use AR here but not sure how to execut Postgres functions
|
||||||
# when inserting data like this.
|
# when inserting data like this.
|
||||||
rows = Post.exec_sql_row_count("UPDATE #{table_name} SET search_data = TO_TSVECTOR('#{stemmer}', ?) WHERE #{foreign_key} = ?", search_data, id)
|
rows = Post.exec_sql_row_count("UPDATE #{table_name}
|
||||||
|
SET
|
||||||
|
raw_data = :search_data,
|
||||||
|
locale = :locale,
|
||||||
|
search_data = TO_TSVECTOR('#{stemmer}', :search_data)
|
||||||
|
WHERE #{foreign_key} = :id",
|
||||||
|
search_data: search_data, id: id, locale: SiteSetting.default_locale)
|
||||||
if rows == 0
|
if rows == 0
|
||||||
Post.exec_sql("INSERT INTO #{table_name} (#{foreign_key}, search_data) VALUES (?, TO_TSVECTOR('#{stemmer}', ?))", id, search_data)
|
Post.exec_sql("INSERT INTO #{table_name}
|
||||||
|
(#{foreign_key}, search_data, locale, raw_data)
|
||||||
|
VALUES (:id, TO_TSVECTOR('#{stemmer}', :search_data), :locale, :search_data)",
|
||||||
|
search_data: search_data, id: id, locale: SiteSetting.default_locale)
|
||||||
end
|
end
|
||||||
rescue
|
rescue
|
||||||
# don't allow concurrency to mess up saving a post
|
# don't allow concurrency to mess up saving a post
|
||||||
|
@ -39,7 +50,7 @@ class SearchObserver < ActiveRecord::Observer
|
||||||
update_index('category', category_id, name)
|
update_index('category', category_id, name)
|
||||||
end
|
end
|
||||||
|
|
||||||
def after_save(obj)
|
def self.index(obj)
|
||||||
if obj.class == Post && obj.cooked_changed?
|
if obj.class == Post && obj.cooked_changed?
|
||||||
if obj.topic
|
if obj.topic
|
||||||
category_name = obj.topic.category.name if obj.topic.category
|
category_name = obj.topic.category.name if obj.topic.category
|
||||||
|
@ -67,6 +78,10 @@ class SearchObserver < ActiveRecord::Observer
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def after_save(object)
|
||||||
|
SearchObserver.index(object)
|
||||||
|
end
|
||||||
|
|
||||||
|
|
||||||
class HtmlScrubber < Nokogiri::XML::SAX::Document
|
class HtmlScrubber < Nokogiri::XML::SAX::Document
|
||||||
attr_reader :scrubbed
|
attr_reader :scrubbed
|
||||||
|
|
11
db/migrate/20140624044600_add_raw_data_to_search.rb
Normal file
11
db/migrate/20140624044600_add_raw_data_to_search.rb
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
class AddRawDataToSearch < ActiveRecord::Migration
|
||||||
|
def change
|
||||||
|
add_column :post_search_data, :raw_data, :text
|
||||||
|
add_column :user_search_data, :raw_data, :text
|
||||||
|
add_column :category_search_data, :raw_data, :text
|
||||||
|
|
||||||
|
add_column :post_search_data, :locale, :string
|
||||||
|
add_column :user_search_data, :locale, :text
|
||||||
|
add_column :category_search_data, :locale, :text
|
||||||
|
end
|
||||||
|
end
|
|
@ -19,25 +19,67 @@ class Search
|
||||||
end
|
end
|
||||||
|
|
||||||
def self.long_locale
|
def self.long_locale
|
||||||
case I18n.locale # Currently-present in /conf/locales/* only, sorry :-( Add as needed
|
# if adding a language see:
|
||||||
when :da then 'danish'
|
# /usr/share/postgresql/9.3/tsearch_data for possible options
|
||||||
when :de then 'german'
|
# Do not add languages that are missing without amending the
|
||||||
when :en then 'english'
|
# base docker config
|
||||||
when :es then 'spanish'
|
#
|
||||||
when :fr then 'french'
|
case SiteSetting.default_locale.to_sym
|
||||||
when :it then 'italian'
|
when :da then 'danish'
|
||||||
when :ja then 'japanese'
|
when :de then 'german'
|
||||||
when :nl then 'dutch'
|
when :en then 'english'
|
||||||
when :pt then 'portuguese'
|
when :es then 'spanish'
|
||||||
when :sv then 'swedish'
|
when :fr then 'french'
|
||||||
when :ru then 'russian'
|
when :it then 'italian'
|
||||||
|
when :nl then 'dutch'
|
||||||
|
when :nb_NO then 'norwegian'
|
||||||
|
when :pt then 'portuguese'
|
||||||
|
when :pt_BR then 'portuguese'
|
||||||
|
when :sv then 'swedish'
|
||||||
|
when :ru then 'russian'
|
||||||
else 'simple' # use the 'simple' stemmer for other languages
|
else 'simple' # use the 'simple' stemmer for other languages
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def self.rebuild_problem_posts(limit = 10000)
|
||||||
|
posts = Post.joins(:topic)
|
||||||
|
.where('posts.id NOT IN (
|
||||||
|
SELECT post_id from post_search_data
|
||||||
|
WHERE locale = ?
|
||||||
|
)', SiteSetting.default_locale).limit(10000)
|
||||||
|
|
||||||
|
posts.each do |post|
|
||||||
|
post.cooked += " "
|
||||||
|
SearchObserver.index(post)
|
||||||
|
end
|
||||||
|
|
||||||
|
nil
|
||||||
|
end
|
||||||
|
|
||||||
|
def self.prepare_data(search_data)
|
||||||
|
data = search_data.squish
|
||||||
|
# TODO rmmseg is designed for chinese, we need something else for Korean / Japanese
|
||||||
|
if ['zh_TW', 'zh_CN', 'ja', 'ko'].include?(SiteSetting.default_locale)
|
||||||
|
unless defined? RMMSeg
|
||||||
|
require 'rmmseg'
|
||||||
|
RMMSeg::Dictionary.load_dictionaries
|
||||||
|
end
|
||||||
|
|
||||||
|
algo = RMMSeg::Algorithm.new(search_data)
|
||||||
|
|
||||||
|
data = ""
|
||||||
|
while token = algo.next_token
|
||||||
|
data << token.text << " "
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
|
data.force_encoding("UTF-8")
|
||||||
|
data
|
||||||
|
end
|
||||||
|
|
||||||
def initialize(term, opts=nil)
|
def initialize(term, opts=nil)
|
||||||
if term.present?
|
if term.present?
|
||||||
@term = term.to_s
|
@term = Search.prepare_data(term.to_s)
|
||||||
@original_term = PG::Connection.escape_string(@term)
|
@original_term = PG::Connection.escape_string(@term)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -77,6 +77,10 @@ class SqlBuilder
|
||||||
16 => :value_to_boolean
|
16 => :value_to_boolean
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def self.map_exec(klass, sql, args = {})
|
||||||
|
self.new(sql).map_exec(klass, args)
|
||||||
|
end
|
||||||
|
|
||||||
def map_exec(klass = OpenStruct, args = {})
|
def map_exec(klass = OpenStruct, args = {})
|
||||||
results = exec(args)
|
results = exec(args)
|
||||||
|
|
||||||
|
|
40
spec/models/search_observer_spec.rb
Normal file
40
spec/models/search_observer_spec.rb
Normal file
|
@ -0,0 +1,40 @@
|
||||||
|
require 'spec_helper'
|
||||||
|
|
||||||
|
describe SearchObserver do
|
||||||
|
|
||||||
|
def get_row(post_id)
|
||||||
|
SqlBuilder.map_exec(
|
||||||
|
OpenStruct,
|
||||||
|
"select * from post_search_data where post_id = :post_id",
|
||||||
|
post_id: post_id
|
||||||
|
).first
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'correctly indexes chinese' do
|
||||||
|
SiteSetting.default_locale = 'zh_CN'
|
||||||
|
data = "你好世界"
|
||||||
|
data.split(" ").length.should == 1
|
||||||
|
|
||||||
|
SearchObserver.update_posts_index(99, "你好世界", "", nil)
|
||||||
|
|
||||||
|
row = get_row(99)
|
||||||
|
row.raw_data.split(' ').length.should == 2
|
||||||
|
end
|
||||||
|
|
||||||
|
it 'correctly indexes a post' do
|
||||||
|
data = "<a>This</a> is a test"
|
||||||
|
|
||||||
|
SearchObserver.update_posts_index(99, data, "", nil)
|
||||||
|
|
||||||
|
row = get_row(99)
|
||||||
|
|
||||||
|
row.raw_data.should == "This is a test"
|
||||||
|
row.locale.should == "en"
|
||||||
|
|
||||||
|
SearchObserver.update_posts_index(99, "tester", "", nil)
|
||||||
|
|
||||||
|
row = get_row(99)
|
||||||
|
|
||||||
|
row.raw_data.should == "tester"
|
||||||
|
end
|
||||||
|
end
|
Loading…
Reference in a new issue