mirror of
https://github.com/codeninjasllc/discourse.git
synced 2024-11-27 09:36:19 -05:00
Special case: When crawling a link to an image, just put the filename as
the title.
This commit is contained in:
parent
99e2bab62d
commit
e80851b0fa
1 changed file with 20 additions and 9 deletions
|
@ -89,16 +89,27 @@ module Jobs
|
|||
|
||||
crawled = false
|
||||
|
||||
result = CrawlTopicLink.fetch_beginning(topic_link.url)
|
||||
doc = Nokogiri::HTML(result)
|
||||
if doc
|
||||
title = doc.at('title').try(:inner_text)
|
||||
if title.present?
|
||||
title.gsub!(/\n/, ' ')
|
||||
title.gsub!(/ +/, ' ')
|
||||
title.strip!
|
||||
# Special case: Images
|
||||
# If the link is to an image, put the filename as the title
|
||||
if topic_link.url =~ /\.(jpg|gif|png)$/
|
||||
uri = URI(topic_link.url)
|
||||
filename = File.basename(uri.path)
|
||||
crawled = (TopicLink.where(id: topic_link.id).update_all(["title = ?, crawled_at = CURRENT_TIMESTAMP", filename]) == 1)
|
||||
end
|
||||
|
||||
unless crawled
|
||||
# Fetch the beginning of the document to find the title
|
||||
result = CrawlTopicLink.fetch_beginning(topic_link.url)
|
||||
doc = Nokogiri::HTML(result)
|
||||
if doc
|
||||
title = doc.at('title').try(:inner_text)
|
||||
if title.present?
|
||||
crawled = (TopicLink.where(id: topic_link.id).update_all(['title = ?, crawled_at = CURRENT_TIMESTAMP', title[0..255]]) == 1)
|
||||
title.gsub!(/\n/, ' ')
|
||||
title.gsub!(/ +/, ' ')
|
||||
title.strip!
|
||||
if title.present?
|
||||
crawled = (TopicLink.where(id: topic_link.id).update_all(['title = ?, crawled_at = CURRENT_TIMESTAMP', title[0..255]]) == 1)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in a new issue