FIX: Problem crawling amazon titles

2024-11-30 10:58:31 -05:00 · 2014-04-08 16:39:47 -04:00 · 2014-04-08 16:39:47 -04:00 · aa63868d5e
commit aa63868d5e
parent c3bc49d5d4
1 changed files with 14 additions and 1 deletions
--- a/app/jobs/regular/crawl_topic_link.rb
+++ b/app/jobs/regular/crawl_topic_link.rb
@@ -29,6 +29,10 @@ module Jobs
      return if uri.blank? || uri.host.blank?
      headers = CrawlTopicLink.request_headers(uri)
      head = Excon.head(url, read_timeout: 20, headers: headers)
+
+      # If the site does not allow HEAD, just try the url
+      return uri if head.status == 405
+
      if head.status == 200
        uri = nil unless header_for(head, 'content-type') =~ /text\/html/
        return uri
@@ -43,6 +47,15 @@ module Jobs
      nil
    end

+    def self.max_chunk_size(uri)
+      # Amazon puts the title very late in the page, so more of it must be read. Normally it's a
+      # bad idea to make an exception for one host, but Amazon is a big one.
+      return 80 if uri.host =~ /amazon\.(com|ca|co.uk)$/
+
+      # Default is 10k (the value returned here is in kilobytes)
+      10
+    end
+
    # Fetch the beginning of a HTML document at a url
    def self.fetch_beginning(url)
      # Never crawl in test mode
@@ -58,7 +71,7 @@ module Jobs
        # Using exceptions for flow control is really bad, but there really seems to
        # be no sane way to get a stream to stop reading in Excon (or Net::HTTP for
        # that matter!)
-        raise ReadEnough.new if result.size > 1024 * 10
+        raise ReadEnough.new if result.size > (CrawlTopicLink.max_chunk_size(uri) * 1024)
      end
      Excon.get(uri.to_s, response_block: streamer, read_timeout: 20, headers: CrawlTopicLink.request_headers(uri))
      result