diff --git a/Gemfile b/Gemfile index 10f209d3d..da9298c3d 100644 --- a/Gemfile +++ b/Gemfile @@ -47,8 +47,6 @@ gem 'aws-sdk', require: false gem 'excon', require: false gem 'unf', require: false -gem 'email_reply_parser' - # note: for image_optim to correctly work you need to follow # https://github.com/toy/image_optim # pinned due to https://github.com/toy/image_optim/pull/75, docker image must be upgraded to upgrade diff --git a/Gemfile.lock b/Gemfile.lock index 4b249ed10..154d70600 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -115,7 +115,6 @@ GEM railties docile (1.1.5) dotenv (2.0.2) - email_reply_parser (0.5.8) ember-data-source (1.0.0.beta.16.1) ember-source (~> 1.8) ember-handlebars-template (0.1.5) @@ -451,7 +450,6 @@ DEPENDENCIES byebug certified discourse-qunit-rails - email_reply_parser ember-rails ember-source (= 1.12.1) excon diff --git a/lib/email/email_reply_parser.rb b/lib/email/email_reply_parser.rb new file mode 100644 index 000000000..14282cee5 --- /dev/null +++ b/lib/email/email_reply_parser.rb @@ -0,0 +1,278 @@ +require 'strscan' + +# https://github.com/github/email_reply_parser/blob/master/lib/email_reply_parser.rb +# +# EmailReplyParser is a small library to parse plain text email content. The +# goal is to identify which fragments are quoted, part of a signature, or +# original body content. We want to support both top and bottom posters, so +# no simple "REPLY ABOVE HERE" content is used. +# +# Beyond RFC 5322 (which is handled by the [Ruby mail gem][mail]), there aren't +# any real standards for how emails are created. This attempts to parse out +# common conventions for things like replies: +# +# this is some text +# +# On , wrote: +# > blah blah +# > blah blah +# +# ... and signatures: +# +# this is some text +# +# -- +# Bob +# http://homepage.com/~bob +# +# Each of these are parsed into Fragment objects. +# +# EmailReplyParser also attempts to figure out which of these blocks should +# be hidden from users. 
#
# [mail]: https://github.com/mikel/mail
class EmailReplyParser

  # Public: Splits an email body into a list of Fragments.
  #
  # text - A String email body. nil is treated as an empty body.
  #
  # Returns an Email instance.
  def self.read(text)
    Email.new.read(text)
  end

  # Public: Get the text of the visible portions of the given email body.
  #
  # text - A String email body. nil is treated as an empty body.
  #
  # Returns a String.
  def self.parse_reply(text)
    read(text).visible_text
  end

  ### Emails

  # An Email instance represents a parsed body String.
  class Email
    # Shared empty String used for blank-line comparisons.
    EMPTY = "".freeze

    # Matches from the first line that starts with "On " (and is eventually
    # followed by "wrote:") through the end of the body.
    #
    # NOTE(review): because of the /m flag, `.+` spans newlines, so the match
    # starts at the FIRST line beginning with "On " as long as "wrote:"
    # appears anywhere later in the body. A reply paragraph that itself starts
    # with "On " is therefore cut as well - confirm this is intended.
    REPLY_HEADER = /^(On\s.+wrote:.*)$/nm

    # Matches from a "---- Original Message ----" style separator through the
    # end of the body.
    ORIGINAL_MESSAGE = /^([\s_-]+Original (?i)message?[\s_-]+$.*)/nm

    # Signature markers, written for the REVERSED line text that scan_line
    # sees: a line ending in "--" or "__", or a reversed "Sent from my ..."
    # line.
    SIGNATURE = '(?m)(--\s*$|__\s*$)|(^(\w+\s*){1,3} ym morf tneS$)'

    # Use RE2 when the optional 're2' library can be loaded; otherwise fall
    # back to Ruby's built-in Regexp.
    begin
      require 're2'
      SIG_REGEX = RE2::Regexp.new(SIGNATURE)
    rescue LoadError
      SIG_REGEX = Regexp.new(SIGNATURE)
    end

    # Emails have an Array of Fragments.
    attr_reader :fragments

    def initialize
      @fragments = []
    end

    # Public: Gets the combined text of the visible fragments of the email body.
    #
    # Returns a String.
    def visible_text
      fragments.reject(&:hidden?).map(&:to_s).join("\n").rstrip
    end

    # Splits the given text into a list of Fragments. This is roughly done by
    # reversing the text and parsing from the bottom to the top.
    #
    # text - A String email body. The caller's object is never mutated; nil is
    #        treated as an empty body.
    #
    # Returns this same Email instance.
    def read(text)
      # Work on a private copy; nil becomes "" instead of raising.
      text = text.to_s.dup

      # in 1.9 we want to operate on the raw bytes
      text.force_encoding('binary') if text.respond_to?(:force_encoding)

      # Start from a clean slate so that calling #read again on the same
      # instance does not mix fragments from two different bodies.
      @fragments = []

      # Normalize line endings.
      text.gsub!("\r\n", "\n")

      # Check for "On DATE, NAME <EMAIL> wrote:"
      # or "---- Original Message ----" and strip
      # email content after that part.
      #
      # The capture always runs to the end of the body, so cut at its start
      # offset. (Deleting the captured String with gsub! would also delete any
      # identical text that happens to occur earlier in the body.)
      cutoff = REPLY_HEADER.match(text) || ORIGINAL_MESSAGE.match(text)
      text = text[0, cutoff.begin(1)] if cutoff

      # Some users may reply directly above a line of underscores.
      # In order to ensure that these fragments are split correctly,
      # make sure that all lines of underscores are preceded by
      # at least two newline characters.
      text.gsub!(/([^\n])(?=\n_{7}_+)$/m, "\\1\n")

      # The text is reversed initially due to the way we check for hidden
      # fragments.
      text = text.reverse

      # This determines if any 'visible' Fragment has been found. Once any
      # visible Fragment is found, stop looking for hidden ones.
      @found_visible = false

      # This instance variable points to the current Fragment. If the matched
      # line fits, it should be added to this Fragment. Otherwise, finish it
      # and start a new Fragment.
      @fragment = nil

      # Use the StringScanner to pull out each line of the email content.
      @scanner = StringScanner.new(text)
      while (line = @scanner.scan_until(/\n/n))
        scan_line(line)
      end

      # Be sure to parse the last line of the email.
      last_line = @scanner.rest.to_s
      scan_line(last_line) unless last_line.empty?

      # Finish up the final fragment. Finishing a fragment will detect any
      # attributes (hidden, signature, reply), and join each line into a
      # string.
      finish_fragment

      @scanner = @fragment = nil

      # Now that parsing is done, reverse the order.
      @fragments.reverse!
      self
    end

    private

    ### Line-by-Line Parsing

    # Scans the given line of text and figures out which fragment it belongs
    # to.
    #
    # line - A String line of (reversed) text from the email. It is modified
    #        in place (trailing newline and leading whitespace removed).
    #
    # Returns nothing.
    def scan_line(line)
      line.chomp!("\n")
      # Keep the whitespace of signature marker lines intact.
      line.lstrip! unless SIG_REGEX.match(line)

      # We're looking for leading `>`'s to see if this line is part of a
      # quoted Fragment. (The line is reversed, so they appear at its end.)
      is_quoted = !!(line =~ /(>+)$/n)

      # Mark the current Fragment as a signature if the current line is empty
      # and the Fragment starts with a common signature indicator.
      if @fragment && line == EMPTY && SIG_REGEX.match(@fragment.lines.last)
        @fragment.signature = true
        finish_fragment
      end

      # If the line matches the current fragment, add it. Note that a common
      # reply header also counts as part of the quoted Fragment, even though
      # it doesn't start with `>`.
      if @fragment &&
          ((@fragment.quoted? == is_quoted) ||
           (@fragment.quoted? && (quote_header?(line) || line == EMPTY)))
        @fragment.lines << line

      # Otherwise, finish the fragment and start a new one.
      else
        finish_fragment
        @fragment = Fragment.new(is_quoted, line)
      end
    end

    # Detects if a given line is a header above a quoted area. It is only
    # checked for lines preceding quoted regions.
    #
    # line - A String line of (reversed) text from the email.
    #
    # Returns true if the line is a valid header, or false.
    def quote_header?(line)
      !!(line =~ /^:etorw.*nO$/n)
    end

    # Builds the fragment string and reverses it, after all lines have been
    # added. It also checks to see if this Fragment is hidden. The hidden
    # Fragment check reads from the bottom to the top.
    #
    # Any quoted Fragments or signature Fragments are marked hidden if they
    # are below any visible Fragments. Visible Fragments are expected to
    # contain original content by the author. If they are below a quoted
    # Fragment, then the Fragment should be visible to give context to the
    # reply.
    #
    #     some original text (visible)
    #
    #     > do you have any two's? (quoted, visible)
    #
    #     Go fish! (visible)
    #
    #     > --
    #     > Player 1 (quoted, hidden)
    #
    #     --
    #     Player 2 (signature, hidden)
    #
    # Returns nothing.
    def finish_fragment
      if @fragment
        @fragment.finish
        unless @found_visible
          if @fragment.quoted? || @fragment.signature? ||
              @fragment.to_s.strip == EMPTY
            @fragment.hidden = true
          else
            @found_visible = true
          end
        end
        @fragments << @fragment
      end
      @fragment = nil
    end
  end

  ### Fragments

  # Represents a group of paragraphs in the email sharing common attributes.
  # Paragraphs should get their own fragment if they are a quoted area or a
  # signature.
  class Fragment < Struct.new(:quoted, :signature, :hidden)
    # This is an Array of String lines of content. Since the content is
    # reversed, this array is backwards, and contains reversed strings.
    # It is set to nil once #finish has run.
    attr_reader :lines

    # This is reserved for the joined String that is built when this Fragment
    # is finished.
    attr_reader :content

    # quoted     - Boolean, whether the first line was detected as quoted.
    # first_line - The first (reversed) String line; nil is dropped.
    def initialize(quoted, first_line)
      self.signature = self.hidden = false
      self.quoted = quoted
      @lines = [first_line]
      @content = nil
      @lines.compact!
    end

    alias quoted? quoted
    alias signature? signature
    alias hidden? hidden

    # Builds the string content by joining the lines and reversing them.
    #
    # Returns nothing.
    def finish
      @content = @lines.join("\n")
      @lines = nil
      @content.reverse!
    end

    # Returns the finished content String (nil before #finish is called).
    def to_s
      @content
    end

    def inspect
      to_s.inspect
    end
  end
end

# ---------------------------------------------------------------------------
# The rest of this span is patch text for other files and is not part of
# EmailReplyParser. It is preserved below as comments; line breaks and
# indentation were reconstructed from a whitespace-collapsed diff.
# ---------------------------------------------------------------------------
# diff --git a/lib/email/receiver.rb b/lib/email/receiver.rb
# index 07f89304e..eb4a84eb4 100644
# --- a/lib/email/receiver.rb
# +++ b/lib/email/receiver.rb
# @@ -1,5 +1,6 @@
#  require_dependency 'new_post_manager'
#  require_dependency 'email/html_cleaner'
# +require_dependency 'email/email_reply_parser'
#
#  module Email
#
# diff --git a/spec/components/email/receiver_spec.rb b/spec/components/email/receiver_spec.rb
# index b4af327b6..59ecc55ee 100644
# --- a/spec/components/email/receiver_spec.rb
# +++ b/spec/components/email/receiver_spec.rb
# @@ -100,53 +100,6 @@ It will also be my *only* reply."
#      )
#    end
#
# -  it "handles inline reply" do
# -    expect(test_parse_body(fixture_file("emails/inline_reply.eml"))).
# -      to eq(
# -"On Wed, Oct 8, 2014 at 11:12 AM, techAPJ wrote:
# -
# -> techAPJ
# -> November 28
# ->
# -> Test reply.
# ->
# -> First paragraph.
# ->
# -> Second paragraph.
# ->
# -> To respond, reply to this email or visit
# -> https://meta.discourse.org/t/testing-default-email-replies/22638/3 in
# -> your browser.
# -> ------------------------------
# -> Previous Replies codinghorror
# ->
# -> November 28
# ->
# -> We're testing the latest GitHub email processing library which we are
# -> integrating now.
-> -> https://github.com/github/email_reply_parser -> -> Go ahead and reply to this topic and I'll reply from various email clients -> for testing. -> ------------------------------ -> -> To respond, reply to this email or visit -> https://meta.discourse.org/t/testing-default-email-replies/22638/3 in -> your browser. -> -> To unsubscribe from these emails, visit your user preferences -> . -> - -The quick brown fox jumps over the lazy dog. The quick brown fox jumps over -the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown -fox jumps over the lazy dog. The quick brown fox jumps over the lazy dog. -The quick brown fox jumps over the lazy dog. The quick brown fox jumps over -the lazy dog. The quick brown fox jumps over the lazy dog." - ) - end - it "can retrieve the first part of multiple replies" do expect(test_parse_body(fixture_file("emails/inline_mixed.eml"))).to eq( "The quick brown fox jumps over the lazy dog. The quick brown fox jumps over @@ -173,6 +126,14 @@ the lazy dog. The quick brown fox jumps over the lazy dog. The quick brown" expect(test_parse_body(fixture_file("emails/iphone_signature.eml"))).not_to match(/Sent from my iPhone/) end + it "strips regular signature" do + expect(test_parse_body(fixture_file("emails/signature.eml"))).not_to match(/Arpit/) + end + + it "strips 'original message' context" do + expect(test_parse_body(fixture_file("emails/original_message_context.eml"))).not_to match(/Context/) + end + it "properly renders email reply from gmail web client" do expect(test_parse_body(fixture_file("emails/gmail_web.eml"))). 
to eq( diff --git a/spec/fixtures/emails/original_message_context.eml b/spec/fixtures/emails/original_message_context.eml new file mode 100644 index 000000000..31088c16e --- /dev/null +++ b/spec/fixtures/emails/original_message_context.eml @@ -0,0 +1,30 @@ +Delivered-To: test@mail.com +Return-Path: +From: Walter White +Content-Type: multipart/alternative; + boundary=Apple-Mail-8E182EEF-9DBC-41DE-A593-DF2E5EBD3975 +Content-Transfer-Encoding: 7bit +Mime-Version: 1.0 (1.0) +Subject: Re: Signature in email replies! +Date: Thu, 23 Oct 2014 14:43:49 +0530 +References: <1234@mail.gmail.com> +In-Reply-To: <1234@mail.gmail.com> +To: Arpit Jalan +X-Mailer: iPhone Mail (12A405) + + +--Apple-Mail-8E182EEF-9DBC-41DE-A593-DF2E5EBD3975 +Content-Type: text/plain; + charset=us-ascii +Content-Transfer-Encoding: 7bit + +This post should not include signature. +----Original Message---- + +Context here. + +> On 23-Oct-2014, at 9:45 am, Arpit Jalan wrote: +> +> Signature in email replies! + +--Apple-Mail-8E182EEF-9DBC-41DE-A593-DF2E5EBD3975 diff --git a/spec/fixtures/emails/signature.eml b/spec/fixtures/emails/signature.eml new file mode 100644 index 000000000..01a0dd787 --- /dev/null +++ b/spec/fixtures/emails/signature.eml @@ -0,0 +1,29 @@ +Delivered-To: test@mail.com +Return-Path: +From: Walter White +Content-Type: multipart/alternative; + boundary=Apple-Mail-8E182EEF-9DBC-41DE-A593-DF2E5EBD3975 +Content-Transfer-Encoding: 7bit +Mime-Version: 1.0 (1.0) +Subject: Re: Signature in email replies! +Date: Thu, 23 Oct 2014 14:43:49 +0530 +References: <1234@mail.gmail.com> +In-Reply-To: <1234@mail.gmail.com> +To: Arpit Jalan +X-Mailer: iPhone Mail (12A405) + + +--Apple-Mail-8E182EEF-9DBC-41DE-A593-DF2E5EBD3975 +Content-Type: text/plain; + charset=us-ascii +Content-Transfer-Encoding: 7bit + +This post should not include signature. + +----Arpit + +> On 23-Oct-2014, at 9:45 am, Arpit Jalan wrote: +> +> Signature in email replies! + +--Apple-Mail-8E182EEF-9DBC-41DE-A593-DF2E5EBD3975