@@ -7,6 +7,7 @@ import com.fsck.k9.message.html.EmailSection
77import com.fsck.k9.message.html.EmailSectionExtractor
88import com.fsck.k9.message.html.HtmlConverter
99
10+ @Suppress(" TooManyFunctions" )
1011internal class PreviewTextExtractor {
1112 @Throws(PreviewExtractionException ::class )
1213 fun extractPreview (textPart : Part ): String {
@@ -32,18 +33,13 @@ internal class PreviewTextExtractor {
3233 intermediateText = stripSignature(intermediateText)
3334 intermediateText = extractUnquotedText(intermediateText)
3435
35- // try to remove lines of dashes in the preview
36- intermediateText = intermediateText.replace(" (?m)^----.*?$" .toRegex(), " " )
37- // Remove horizontal rules.
38- intermediateText = intermediateText.replace(" \\ s*([-=_]{30,}+)\\ s*" .toRegex(), " " )
39-
36+ // Run line-based cleanup before HTML normalization, so that we don't remove line breaks in HTML
37+ intermediateText = stripLineBasedArtifacts(intermediateText)
4038 // Always parse the text as HTML, independently of the mimetype
4139 intermediateText = HtmlConverter .htmlToText(intermediateText)
42- // Remove parsed HTML links/images "<url>", "( url )", "(url)", etc.
43- intermediateText = intermediateText.replace(" [(<]\\ s?https?://\\ S+[^)>]\\ s?[>)]" .toRegex(), " " )
44-
45- // Remove invisible formatting characters that can otherwise dominate previews.
46- intermediateText = intermediateText.replace(" [\\ u034F\\ u200B-\\ u200D\\ uFEFF]" .toRegex(), " " )
40+ intermediateText = stripLineBasedArtifacts(intermediateText)
41+ intermediateText = removeParsedHtmlUrls(intermediateText)
42+ intermediateText = removeInvisibleFormattingCharacters(intermediateText)
4743
4844 // URLs in the preview should just be shown as "..." - They're not
4945 // clickable and they usually overwhelm the preview
@@ -62,6 +58,45 @@ internal class PreviewTextExtractor {
6258 }
6359 }
6460
/**
 * Runs the line-oriented cleanup passes over [text] and returns the result.
 *
 * The passes are applied in a fixed order, each one consuming the output of the previous one:
 * forwarded-message metadata, then dash lines, then horizontal rules.
 */
private fun stripLineBasedArtifacts(text: String): String {
    val withoutForwardedMetadata = stripForwardedMessageMetadata(text)
    val withoutDashLines = stripDashLines(withoutForwardedMetadata)
    return stripHorizontalRules(withoutDashLines)
}
70+
/**
 * Replaces every match of [REGEX_FORWARDED_MESSAGE_HEADER_BLOCK] in [text].
 *
 * A match that has non-blank text both before and after it is replaced with a string built from
 * [PREVIEW_SECTION_SEPARATOR]; any other match (nothing but whitespace on at least one side) is
 * replaced with the plain literal in the `else` branch.
 *
 * NOTE(review): the whitespace inside the two string literals below may be an artifact of how this
 * listing was extracted -- confirm the exact literals against the real file.
 */
71+ private fun stripForwardedMessageMetadata (text : String ): String {
72+ return REGEX_FORWARDED_MESSAGE_HEADER_BLOCK .replace(text) { matchResult ->
// Both checks look at the original input around the match, not at the partially replaced result.
73+ val hasContentBefore = text.substring(0 , matchResult.range.first).isNotBlank()
74+ val hasContentAfter = text.substring(matchResult.range.last + 1 ).isNotBlank()
75+
// Only insert the separator when the removed block sits between two pieces of real content.
76+ if (hasContentBefore && hasContentAfter) {
77+ " $PREVIEW_SECTION_SEPARATOR "
78+ } else {
79+ " "
80+ }
81+ }
82+ }
83+
/**
 * Returns [text] with every match of [REGEX_DASH_LINE] replaced by the literal below.
 *
 * NOTE(review): the replacement literal's whitespace may be an extraction artifact -- confirm.
 */
84+ private fun stripDashLines (text : String ): String {
85+ return text.replace(REGEX_DASH_LINE , " " )
86+ }
87+
/**
 * Returns [text] with every match of [REGEX_HORIZONTAL_RULE] replaced by the literal below.
 *
 * NOTE(review): the replacement literal's whitespace may be an extraction artifact -- confirm.
 */
88+ private fun stripHorizontalRules (text : String ): String {
89+ return text.replace(REGEX_HORIZONTAL_RULE , " " )
90+ }
91+
/**
 * Returns [text] with every match of [REGEX_PARSED_HTML_URL] replaced by the literal below.
 *
 * The caller applies this after the HTML-to-text conversion, to drop link/image URLs that the
 * conversion left behind in forms such as "<url>", "( url )" or "(url)".
 *
 * NOTE(review): the replacement literal's whitespace may be an extraction artifact -- confirm.
 */
92+ private fun removeParsedHtmlUrls (text : String ): String {
93+ return text.replace(REGEX_PARSED_HTML_URL , " " )
94+ }
95+
/**
 * Returns [text] with every match of [REGEX_INVISIBLE_FORMATTING_CHARACTERS] replaced by the
 * literal below, so that invisible formatting characters do not dominate the preview.
 *
 * NOTE(review): the replacement literal's whitespace may be an extraction artifact -- confirm.
 */
96+ private fun removeInvisibleFormattingCharacters (text : String ): String {
97+ return text.replace(REGEX_INVISIBLE_FORMATTING_CHARACTERS , " " )
98+ }
99+
// Replaces every match of REGEX_CRLF in the given text with the newline literal below.
// NOTE(review): presumably this maps CRLF and lone CR to a single LF; the spaces shown around
// "\n" look like an extraction artifact -- confirm against the real file.
65100 private fun normalizeLineBreaks (text : String ) = text.replace(REGEX_CRLF , " \n " )
66101
67102 private fun stripSignature (text : String ): String {
@@ -129,7 +164,17 @@ internal class PreviewTextExtractor {
// Constants and precompiled regular expressions used by the preview extraction steps.
// NOTE(review): spaces right after opening quotes and after "\\" in the literals below look like
// artifacts of how this listing was extracted -- confirm the exact patterns against the real file.
129164 companion object {
// Size limits for preview extraction; their use sites are outside this view.
130165 private const val MAX_PREVIEW_LENGTH = 512
131166 private const val MAX_CHARACTERS_CHECKED_FOR_PREVIEW = 8192L
// Marker inserted by stripForwardedMessageMetadata where a header block between two pieces of
// content was removed.
167+ private const val PREVIEW_SECTION_SEPARATOR = " […]"
132168
// Line break sequences rewritten by normalizeLineBreaks: "\r\n" or a lone "\r".
133169 private val REGEX_CRLF = " (\\ r\\ n|\\ r)" .toRegex()
// A line (multiline mode) made of four or more dashes followed only by optional whitespace.
170+ private val REGEX_DASH_LINE = " (?m)^-{4,}\\ s*$" .toRegex()
// A run of 30 or more '-', '=' or '_' characters, including any surrounding whitespace.
171+ private val REGEX_HORIZONTAL_RULE = " \\ s*([-=_]{30,}+)\\ s*" .toRegex()
// An http(s) URL wrapped in parentheses or angle brackets, e.g. "<url>", "( url )", "(url)".
172+ private val REGEX_PARSED_HTML_URL = " [(<]\\ s?https?://\\ S+[^)>]\\ s?[>)]" .toRegex()
// Invisible formatting characters: U+034F, U+200B through U+200D, and U+FEFF.
173+ private val REGEX_INVISIBLE_FORMATTING_CHARACTERS = " [\\ u034F\\ u200B-\\ u200D\\ uFEFF]" .toRegex()
// An "Original Message" line (case-insensitive, optionally padded with tabs, spaces or dashes),
// followed by one or more "Name: value" header lines and an optional trailing blank line.
174+ private val REGEX_FORWARDED_MESSAGE_HEADER_BLOCK = (
175+ " (?im)^[\\ t -]*Original Message[\\ t -]*\\ n" +
176+ " (?:[\\ t ]*[A-Za-z][A-Za-z-]*:.*\\ n)+" +
177+ " [\\ t ]*\\ n?"
178+ ).toRegex()
134179 }
135180}
0 commit comments