From aa63868d5e8bf1e24d24ab21f78ae5a3e6273d54 Mon Sep 17 00:00:00 2001
From: Robin Ward
Date: Tue, 8 Apr 2014 16:39:47 -0400
Subject: [PATCH] FIX: Problem crawling amazon titles

---
 app/jobs/regular/crawl_topic_link.rb | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/app/jobs/regular/crawl_topic_link.rb b/app/jobs/regular/crawl_topic_link.rb
index 3daff59ab..c0fce0c4c 100644
--- a/app/jobs/regular/crawl_topic_link.rb
+++ b/app/jobs/regular/crawl_topic_link.rb
@@ -29,6 +29,10 @@ module Jobs
       return if uri.blank? || uri.host.blank?
       headers = CrawlTopicLink.request_headers(uri)
       head = Excon.head(url, read_timeout: 20, headers: headers)
+
+      # If the site does not allow HEAD, just try the url
+      return uri if head.status == 405
+
       if head.status == 200
         uri = nil unless header_for(head, 'content-type') =~ /text\/html/
         return uri
@@ -43,6 +47,15 @@ module Jobs
       nil
     end
 
+    def self.max_chunk_size(uri)
+      # Amazon leaves the title until very late. Normally it's a bad idea to make an exception for
+      # one host, but amazon is a big one.
+      return 80 if uri.host =~ /amazon\.(com|ca|co\.uk)$/
+
+      # Default is 10k
+      10
+    end
+
     # Fetch the beginning of a HTML document at a url
     def self.fetch_beginning(url)
       # Never crawl in test mode
@@ -58,7 +71,7 @@ module Jobs
         # Using exceptions for flow control is really bad, but there really seems to
         # be no sane way to get a stream to stop reading in Excon (or Net::HTTP for
         # that matter!)
-        raise ReadEnough.new if result.size > 1024 * 10
+        raise ReadEnough.new if result.size > (CrawlTopicLink.max_chunk_size(uri) * 1024)
       end
       Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: CrawlTopicLink.request_headers(uri))
       result
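
A note on the 405 fallback: final_uri probes each link with a HEAD request before crawling, and some origins (amazon among them) answer HEAD with 405 Method Not Allowed even though a GET for the same URL succeeds, so the patch treats 405 as "go ahead and GET anyway". Below is a minimal standalone sketch of that probe pattern, assuming only the excon gem; the crawlable? helper name and the example URL are illustrative, not part of this patch:

  require 'excon'

  # Probe with HEAD first, but treat 405 Method Not Allowed as
  # "possibly fetchable": the server refuses HEAD requests, yet a
  # GET for the same URL may still succeed.
  def crawlable?(url)
    head = Excon.head(url, read_timeout: 20)
    return true if head.status == 405
    head.status == 200 && head.headers['Content-Type'].to_s.include?('text/html')
  end

  crawlable?('https://www.amazon.com/') # true even when the HEAD probe is rejected

The job itself also sends browser-like headers (CrawlTopicLink.request_headers) with the probe, omitted here for brevity.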
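
A note on the early-stop streaming: fetch_beginning depends on Excon invoking response_block once per received chunk and propagating anything raised inside it, so raising ReadEnough is the pragmatic way to stop reading once enough of the document is buffered. The patch makes that cutoff per-host via max_chunk_size (80 KB for amazon hosts, 10 KB otherwise) because amazon emits the title very late in the page. Here is a self-contained sketch of the same pattern, again assuming only the excon gem; this fetch_beginning is a simplified stand-in for the job's method and rescues locally, whereas the job presumably rescues ReadEnough at its call site:

  require 'excon'

  class ReadEnough < StandardError; end

  # Buffer at most max_kb kilobytes of a response body. Excon has no
  # clean way to stop a streaming read, so abort by raising from
  # inside the response block, then rescue and keep the partial buffer.
  def fetch_beginning(url, max_kb: 10)
    result = ""
    streamer = lambda do |chunk, _remaining_bytes, _total_bytes|
      result << chunk
      raise ReadEnough if result.size > max_kb * 1024
    end
    Excon.get(url, response_block: streamer, read_timeout: 20)
    result
  rescue ReadEnough
    result
  end

  fetch_beginning('https://example.com')                 # default 10 KB cap
  fetch_beginning('https://www.amazon.com/', max_kb: 80) # amazon-sized cap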