From e77e7f23cac1006e0520aebb411e9d7ce9fab8aa Mon Sep 17 00:00:00 2001 From: Camille Roux <contact@camilleroux.com> Date: Tue, 6 May 2014 18:36:07 +0200 Subject: [PATCH 1/2] Update the Amazon links regexp Added all the countries displayed in the Amazon footer --- app/jobs/regular/crawl_topic_link.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/jobs/regular/crawl_topic_link.rb b/app/jobs/regular/crawl_topic_link.rb index c8273473d..2dbe41a49 100644 --- a/app/jobs/regular/crawl_topic_link.rb +++ b/app/jobs/regular/crawl_topic_link.rb @@ -50,7 +50,7 @@ module Jobs def self.max_chunk_size(uri) # Amazon leaves the title until very late. Normally it's a bad idea to make an exception for # one host but amazon is a big one. - return 80 if uri.host =~ /amazon\.(com|ca|co.uk)$/ + return 80 if uri.host =~ /amazon\.(com|ca|co.uk|es|fr|de|it|com.au|com.br|cn|in|co.jp|com.mx)$/ # Default is 10k 10 From f14c71b9d4084b3a2f5c29823dd71e8c76db09bd Mon Sep 17 00:00:00 2001 From: Camille Roux <contact@camilleroux.com> Date: Tue, 6 May 2014 19:19:32 +0200 Subject: [PATCH 2/2] Fix the Amazon links regex --- app/jobs/regular/crawl_topic_link.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/jobs/regular/crawl_topic_link.rb b/app/jobs/regular/crawl_topic_link.rb index 2dbe41a49..63497c5ba 100644 --- a/app/jobs/regular/crawl_topic_link.rb +++ b/app/jobs/regular/crawl_topic_link.rb @@ -50,7 +50,7 @@ module Jobs def self.max_chunk_size(uri) # Amazon leaves the title until very late. Normally it's a bad idea to make an exception for # one host but amazon is a big one. - return 80 if uri.host =~ /amazon\.(com|ca|co.uk|es|fr|de|it|com.au|com.br|cn|in|co.jp|com.mx)$/ + return 80 if uri.host =~ /amazon\.(com|ca|co\.uk|es|fr|de|it|com\.au|com\.br|cn|in|co\.jp|com\.mx)$/ # Default is 10k 10