Skip to content

Commit 87bed2e

Browse files
committed
fix(message-extractor): improve handling of forwarded messages and enhance text cleanup
1 parent 60ebe8d commit 87bed2e

2 files changed

Lines changed: 101 additions & 10 deletions

File tree

legacy/core/src/main/java/com/fsck/k9/message/extractors/PreviewTextExtractor.kt

Lines changed: 55 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import com.fsck.k9.message.html.EmailSection
77
import com.fsck.k9.message.html.EmailSectionExtractor
88
import com.fsck.k9.message.html.HtmlConverter
99

10+
@Suppress("TooManyFunctions")
1011
internal class PreviewTextExtractor {
1112
@Throws(PreviewExtractionException::class)
1213
fun extractPreview(textPart: Part): String {
@@ -32,18 +33,13 @@ internal class PreviewTextExtractor {
3233
intermediateText = stripSignature(intermediateText)
3334
intermediateText = extractUnquotedText(intermediateText)
3435

35-
// try to remove lines of dashes in the preview
36-
intermediateText = intermediateText.replace("(?m)^----.*?$".toRegex(), "")
37-
// Remove horizontal rules.
38-
intermediateText = intermediateText.replace("\\s*([-=_]{30,}+)\\s*".toRegex(), " ")
39-
36+
// Run line-based cleanup before HTML normalization, so that we don't remove line breaks in HTML
37+
intermediateText = stripLineBasedArtifacts(intermediateText)
4038
// Always parse the text as HTML, independently of the mimetype
4139
intermediateText = HtmlConverter.htmlToText(intermediateText)
42-
// Remove parsed HTML links/images "<url>", "( url )", "(url)", etc.
43-
intermediateText = intermediateText.replace("[(<]\\s?https?://\\S+[^)>]\\s?[>)]".toRegex(), " ")
44-
45-
// Remove invisible formatting characters that can otherwise dominate previews.
46-
intermediateText = intermediateText.replace("[\\u034F\\u200B-\\u200D\\uFEFF]".toRegex(), "")
40+
intermediateText = stripLineBasedArtifacts(intermediateText)
41+
intermediateText = removeParsedHtmlUrls(intermediateText)
42+
intermediateText = removeInvisibleFormattingCharacters(intermediateText)
4743

4844
// URLs in the preview should just be shown as "..." - They're not
4945
// clickable and they usually overwhelm the preview
@@ -62,6 +58,45 @@ internal class PreviewTextExtractor {
6258
}
6359
}
6460

61+
/**
 * Runs the line-oriented cleanup steps on [text] in a fixed order: forwarded-message header
 * blocks first, then dash lines, then horizontal rules.
 *
 * @param text the text to clean up
 * @return the text after all three steps have been applied
 */
private fun stripLineBasedArtifacts(text: String): String {
    val withoutForwardHeaders = stripForwardedMessageMetadata(text)
    val withoutDashLines = stripDashLines(withoutForwardHeaders)
    return stripHorizontalRules(withoutDashLines)
}
70+
71+
/**
 * Removes every forwarded-message header block matched by [REGEX_FORWARDED_MESSAGE_HEADER_BLOCK].
 *
 * A header block is replaced by " [PREVIEW_SECTION_SEPARATOR] " when non-blank text remains both
 * before and after it, and by a single space otherwise. "Remaining" text is the text that is left
 * once all header blocks are stripped, so other header blocks never count as content. A run of
 * header blocks with only blank text between them yields at most one separator, which prevents
 * leading, trailing or doubled separators for nested or back-to-back forwards.
 *
 * @param text the text to clean up
 * @return [text] with all header blocks replaced; returned unchanged when nothing matches
 */
private fun stripForwardedMessageMetadata(text: String): String {
    val headerBlocks = REGEX_FORWARDED_MESSAGE_HEADER_BLOCK.findAll(text).toList()
    if (headerBlocks.isEmpty()) return text

    // keptSegments[i] is the text in front of headerBlocks[i]; the final entry follows the last block.
    val keptSegments = ArrayList<String>(headerBlocks.size + 1)
    var segmentStart = 0
    for (block in headerBlocks) {
        keptSegments.add(text.substring(segmentStart, block.range.first))
        segmentStart = block.range.last + 1
    }
    keptSegments.add(text.substring(segmentStart))

    // Header blocks at or beyond this index have no surviving content after them.
    val lastContentIndex = keptSegments.indexOfLast { it.isNotBlank() }

    return buildString {
        // True while surviving content has been emitted that is not yet followed by a separator.
        var separatorPending = false
        for (index in headerBlocks.indices) {
            val segment = keptSegments[index]
            append(segment)
            if (segment.isNotBlank()) separatorPending = true

            if (separatorPending && index < lastContentIndex) {
                append(" $PREVIEW_SECTION_SEPARATOR ")
                separatorPending = false
            } else {
                append(" ")
            }
        }
        append(keptSegments.last())
    }
}
83+
84+
/** Deletes every match of [REGEX_DASH_LINE] from [text]. */
private fun stripDashLines(text: String): String = REGEX_DASH_LINE.replace(text, "")
87+
88+
/** Replaces every match of [REGEX_HORIZONTAL_RULE] in [text] with a single space. */
private fun stripHorizontalRules(text: String): String = REGEX_HORIZONTAL_RULE.replace(text, " ")
91+
92+
/** Replaces every match of [REGEX_PARSED_HTML_URL] in [text] with a single space. */
private fun removeParsedHtmlUrls(text: String): String = REGEX_PARSED_HTML_URL.replace(text, " ")
95+
96+
/** Deletes every character matched by [REGEX_INVISIBLE_FORMATTING_CHARACTERS] from [text]. */
private fun removeInvisibleFormattingCharacters(text: String): String =
    REGEX_INVISIBLE_FORMATTING_CHARACTERS.replace(text, "")
99+
65100
/** Replaces everything [REGEX_CRLF] matches in [text] with a single "\n". */
private fun normalizeLineBreaks(text: String): String {
    return REGEX_CRLF.replace(text, "\n")
}
66101

67102
private fun stripSignature(text: String): String {
@@ -129,7 +164,17 @@ internal class PreviewTextExtractor {
129164
companion object {
    private const val MAX_PREVIEW_LENGTH = 512
    private const val MAX_CHARACTERS_CHECKED_FOR_PREVIEW = 8192L

    // Marker inserted where a forwarded-message header block was removed between two pieces of text.
    private const val PREVIEW_SECTION_SEPARATOR = "[…]"

    // A CRLF pair or a lone CR.
    private val REGEX_CRLF = Regex("(\\r\\n|\\r)")

    // A line that starts with four or more dashes, optionally followed by whitespace.
    private val REGEX_DASH_LINE = Regex("(?m)^-{4,}\\s*$")

    // A run of 30 or more '-', '=' or '_' characters together with the whitespace around it.
    private val REGEX_HORIZONTAL_RULE = Regex("\\s*([-=_]{30,}+)\\s*")

    // An http(s) URL wrapped in parentheses or angle brackets, e.g. "<url>", "( url )", "(url)".
    private val REGEX_PARSED_HTML_URL = Regex("[(<]\\s?https?://\\S+[^)>]\\s?[>)]")

    // The characters U+034F, U+200B through U+200D, and U+FEFF.
    private val REGEX_INVISIBLE_FORMATTING_CHARACTERS = Regex("[\\u034F\\u200B-\\u200D\\uFEFF]")

    // An "Original Message" line (case-insensitive, optionally surrounded by dashes, tabs or spaces),
    // followed by one or more "Name: value" lines and an optional blank line.
    private val REGEX_FORWARDED_MESSAGE_HEADER_BLOCK = Regex(
        "(?im)^[\\t -]*Original Message[\\t -]*\\n" +
            "(?:[\\t ]*[A-Za-z][A-Za-z-]*:.*\\n)+" +
            "[\\t ]*\\n?"
    )
}
135180
}

legacy/core/src/test/java/com/fsck/k9/message/extractors/PreviewTextExtractorTest.kt

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,52 @@ class PreviewTextExtractorTest {
222222
assertThat(preview).isEqualTo("some image:")
223223
}
224224

225+
@Test
fun extractPreview_forwardedMessage() {
    // Plain-text mail: an intro line, an "Original Message" header block, then the forwarded body.
    val forwardedMail =
        """
        |Here is the forwarded message:
        |
        |-----Original Message-----
        |From: alice@example.com
        |Sent: Monday, January 1, 2024 10:00 AM
        |To: bob@example.com
        |Subject: Hello
        |
        |This is the original content.
        """.trimMargin()
    val textPart = MessageCreationHelper.createTextPart("text/plain", forwardedMail)

    val actualPreview = previewTextExtractor.extractPreview(textPart)

    // The header block is expected to collapse into the section separator.
    assertThat(actualPreview).isEqualTo("Here is the forwarded message: […] This is the original content.")
}
245+
246+
@Test
fun extractPreview_withHtmlForwardedMessageAsTextPlain() {
    // HTML markup delivered in a part that is declared as text/plain; line breaks are <br> tags.
    val htmlForwardedMail =
        """
        |<html>
        |<body>
        |Here is the forwarded message:<br>
        |<br>
        |-----Original Message-----<br>
        |From: alice@example.com<br>
        |Sent: Monday, January 1, 2024 10:00 AM<br>
        |To: bob@example.com<br>
        |Subject: Hello<br>
        |<br>
        |This is the original content.
        |</body>
        |</html>
        """.trimMargin()
    val textPart = MessageCreationHelper.createTextPart("text/plain", htmlForwardedMail)

    val actualPreview = previewTextExtractor.extractPreview(textPart)

    // Same expected preview as for the plain-text forwarded message.
    assertThat(actualPreview).isEqualTo("Here is the forwarded message: […] This is the original content.")
}
270+
225271
@Test
226272
fun extractPreview_shouldCollapseAndTrimWhitespace() {
227273
val text = " whitespace is\t\tfun "

0 commit comments

Comments
 (0)