deltachat/
dehtml.rs

1//! De-HTML.
2//!
3//! A module to remove HTML tags from the email text
4
5use std::io::BufRead;
6use std::sync::LazyLock;
7
8use quick_xml::{
9    Reader,
10    errors::Error as QuickXmlError,
11    events::{BytesEnd, BytesStart, BytesText},
12};
13
14use crate::simplify::{SimplifiedText, simplify_quote};
15
/// Mutable state shared by the tag and text callbacks while one HTML document is converted.
#[derive(Default)]
struct Dehtml {
    /// Output buffer for text that is not part of a quote (see `get_buf`).
    strbuilder: String,
    /// Output buffer for text that is inside a quote (see `is_quote` / `get_buf`).
    quote: String,
    /// Text mode last selected by a start or end tag. `get_add_text` may override it
    /// with `AddText::No` depending on the `<div>` counters below.
    add_text: AddText,
    /// The non-empty `href` of the most recently opened `<a>` tag, consumed at `</a>`.
    last_href: Option<String>,
    /// GMX wraps a quote in `<div name="quote">`. After a `<div name="quote">`, this count is
    /// increased at each `<div>` and decreased at each `</div>`. This way we know when the quote ends.
    /// If this is > `0`, then we are inside a `<div name="quote">`
    divs_since_quote_div: u32,
    /// Everything between `<div name="quote">` and `<div name="quoted-content">` is usually
    /// metadata.
    /// If this is > `0`, then we are inside a `<div name="quoted-content">`.
    divs_since_quoted_content_div: u32,
    /// `<div class="header-protection-legacy-display">` elements should be omitted, see
    /// <https://www.rfc-editor.org/rfc/rfc9788.html#section-4.5.3.3>.
    /// If this is > `0`, then we are inside such a `<div>`.
    divs_since_hp_legacy_display: u32,
    /// All-Inkl just puts the quote into `<blockquote> </blockquote>`. This count is
    /// increased at each `<blockquote>` and decreased at each `</blockquote>`.
    blockquotes_since_blockquote: u32,
}
36
37impl Dehtml {
38    /// Returns true if HTML parser is currently inside the quote.
39    fn is_quote(&self) -> bool {
40        self.divs_since_quoted_content_div > 0 || self.blockquotes_since_blockquote > 0
41    }
42
43    /// Returns the buffer where the text should be written.
44    ///
45    /// If the parser is inside the quote, returns the quote buffer.
46    fn get_buf(&mut self) -> &mut String {
47        if self.is_quote() {
48            &mut self.quote
49        } else {
50            &mut self.strbuilder
51        }
52    }
53
54    fn get_add_text(&self) -> AddText {
55        // Everything between `<div name="quoted">` and `<div name="quoted_content">` is
56        // metadata which we don't want.
57        if self.divs_since_quote_div > 0 && self.divs_since_quoted_content_div == 0
58            || self.divs_since_hp_legacy_display > 0
59        {
60            AddText::No
61        } else {
62            self.add_text
63        }
64    }
65}
66
/// Controls whether and how text encountered by the parser is added to the output.
#[derive(Debug, Default, PartialEq, Clone, Copy)]
enum AddText {
    /// Inside `<script>`, `<style>` and similar tags
    /// which contents should not be displayed.
    No,

    /// The default mode: text is added, with each run of line ends
    /// replaced by a single space (see `str_cb`).
    #[default]
    YesRemoveLineEnds,

    /// Inside `<pre>`.
    YesPreserveLineEnds,
}
79
80pub(crate) fn dehtml(buf: &str) -> Option<SimplifiedText> {
81    let (s, quote) = dehtml_quick_xml(buf);
82    if !s.trim().is_empty() {
83        let text = dehtml_cleanup(s);
84        let top_quote = if !quote.trim().is_empty() {
85            Some(dehtml_cleanup(simplify_quote(&quote).0))
86        } else {
87            None
88        };
89        return Some(SimplifiedText {
90            text,
91            top_quote,
92            ..Default::default()
93        });
94    }
95    let s = dehtml_manually(buf);
96    if !s.trim().is_empty() {
97        let text = dehtml_cleanup(s);
98        return Some(SimplifiedText {
99            text,
100            ..Default::default()
101        });
102    }
103    None
104}
105
/// Normalizes whitespace in converted text.
///
/// - All `\r` characters are removed.
/// - Leading and trailing whitespace of the whole text is dropped.
/// - Trailing whitespace of every line is dropped; leading whitespace is kept.
/// - Any run of blank (whitespace-only) lines collapses into a single empty line.
fn dehtml_cleanup(mut text: String) -> String {
    text.retain(|c| c != '\r');

    let mut cleaned = String::with_capacity(text.len());
    // Set when at least one blank line was seen since the last emitted line.
    let mut pending_blank = false;

    for raw_line in text.trim().split('\n') {
        let line = raw_line.trim_end();
        if line.is_empty() {
            pending_blank = true;
            continue;
        }

        if !cleaned.is_empty() {
            cleaned.push_str(if pending_blank { "\n\n" } else { "\n" });
        }
        cleaned.push_str(line);
        pending_blank = false;
    }

    cleaned
}
127
/// Converts `buf` to plain text by feeding it through the `quick_xml` pull parser.
///
/// Returns `(text, quote)`: the contents of the main text buffer and of the quote buffer
/// as filled by the tag and text callbacks. The result is not cleaned up here;
/// see `dehtml_cleanup`.
fn dehtml_quick_xml(buf: &str) -> (String, String) {
    // Drop surrounding whitespace and a leading doctype declaration.
    // NOTE(review): the prefix match is case-sensitive, so only the lowercase spelling
    // is stripped here.
    let buf = buf.trim().trim_start_matches("<!doctype html>");

    let mut dehtml = Dehtml {
        strbuilder: String::with_capacity(buf.len()),
        ..Default::default()
    };

    let mut reader = quick_xml::Reader::from_str(buf);
    // Don't let the parser check that end tag names match the open tags:
    // HTML in emails is often not well-formed XML (e.g. `<p>Foo<p>Bar`).
    reader.config_mut().check_end_names = false;

    // Event buffer handed to the reader; shadows the input and is cleared after each event.
    let mut buf = Vec::new();
    // Scratch string reused to pass a single resolved character to `str_cb`.
    let mut char_buf = String::with_capacity(4);

    loop {
        match reader.read_event_into(&mut buf) {
            Ok(quick_xml::events::Event::Start(ref e)) => {
                dehtml_starttag_cb(e, &mut dehtml, &reader)
            }
            Ok(quick_xml::events::Event::End(ref e)) => dehtml_endtag_cb(e, &mut dehtml),
            Ok(quick_xml::events::Event::Text(ref e)) => dehtml_text_cb(e, &mut dehtml),
            Ok(quick_xml::events::Event::CData(e)) => {
                // CDATA content is passed on as text, lossily decoded as UTF-8.
                str_cb(&String::from_utf8_lossy(&e as &[_]), &mut dehtml)
            }
            Ok(quick_xml::events::Event::Empty(ref e)) => {
                // Handle empty tags as a start tag immediately followed by end tag.
                // For example, `<p/>` is treated as `<p></p>`.
                dehtml_starttag_cb(e, &mut dehtml, &reader);
                dehtml_endtag_cb(
                    &BytesEnd::new(String::from_utf8_lossy(e.name().as_ref())),
                    &mut dehtml,
                );
            }
            Ok(quick_xml::events::Event::GeneralRef(ref e)) => {
                match e.resolve_char_ref() {
                    // The reference could not be resolved: log it and output nothing for it.
                    Err(err) => eprintln!(
                        "resolve_char_ref() error at position {}: {:?}",
                        reader.buffer_position(),
                        err,
                    ),
                    // A character reference that resolved to a single character.
                    Ok(Some(ch)) => {
                        char_buf.clear();
                        char_buf.push(ch);
                        str_cb(&char_buf, &mut dehtml);
                    }
                    // Not a character reference: try it as a named HTML5 entity.
                    Ok(None) => {
                        let event_str = String::from_utf8_lossy(e);
                        if let Some(s) = quick_xml::escape::resolve_html5_entity(&event_str) {
                            str_cb(s, &mut dehtml);
                        } else {
                            // Nonstandard entity. Add escaped.
                            str_cb(&format!("&{event_str};"), &mut dehtml);
                        }
                    }
                }
            }
            Err(QuickXmlError::IllFormed(_)) => {
                // This is probably not HTML at all and should be left as is.
                str_cb(&String::from_utf8_lossy(&buf), &mut dehtml);
            }
            Err(e) => {
                // Any other parse error is only logged to stderr; the loop keeps reading.
                eprintln!(
                    "Parse html error: Error at position {}: {:?}",
                    reader.buffer_position(),
                    e
                );
            }
            Ok(quick_xml::events::Event::Eof) => break,
            // All remaining event kinds contribute nothing to the output.
            _ => (),
        }
        buf.clear();
    }

    (dehtml.strbuilder, dehtml.quote)
}
203
204fn dehtml_text_cb(event: &BytesText, dehtml: &mut Dehtml) {
205    if dehtml.get_add_text() == AddText::YesPreserveLineEnds
206        || dehtml.get_add_text() == AddText::YesRemoveLineEnds
207    {
208        let event = event as &[_];
209        let event_str = std::str::from_utf8(event).unwrap_or_default();
210        str_cb(event_str, dehtml);
211    }
212}
213
214fn str_cb(event_str: &str, dehtml: &mut Dehtml) {
215    static LINE_RE: LazyLock<regex::Regex> =
216        LazyLock::new(|| regex::Regex::new(r"(\r?\n)+").unwrap());
217
218    let add_text = dehtml.get_add_text();
219    if add_text == AddText::YesRemoveLineEnds {
220        // Replace all line ends with spaces.
221        // E.g. `\r\n\r\n` is replaced with one space.
222        let event_str = LINE_RE.replace_all(event_str, " ");
223
224        // Add a space if `event_str` starts with a space
225        // and there is no whitespace at the end of the buffer yet.
226        // Trim the rest of leading whitespace from `event_str`.
227        let buf = dehtml.get_buf();
228        if !buf.ends_with(' ') && !buf.ends_with('\n') && event_str.starts_with(' ') {
229            *buf += " ";
230        }
231
232        *buf += event_str.trim_start();
233    } else if add_text == AddText::YesPreserveLineEnds {
234        *dehtml.get_buf() += LINE_RE.replace_all(event_str, "\n").as_ref();
235    }
236}
237
238#[expect(clippy::arithmetic_side_effects)]
239fn dehtml_endtag_cb(event: &BytesEnd, dehtml: &mut Dehtml) {
240    let tag = String::from_utf8_lossy(event.name().as_ref())
241        .trim()
242        .to_lowercase();
243
244    match tag.as_str() {
245        "style" | "script" | "title" | "pre" => {
246            *dehtml.get_buf() += "\n\n";
247            dehtml.add_text = AddText::YesRemoveLineEnds;
248        }
249        "div" => {
250            pop_tag(&mut dehtml.divs_since_quote_div);
251            pop_tag(&mut dehtml.divs_since_quoted_content_div);
252            pop_tag(&mut dehtml.divs_since_hp_legacy_display);
253
254            *dehtml.get_buf() += "\n\n";
255            dehtml.add_text = AddText::YesRemoveLineEnds;
256        }
257        "a" => {
258            if let Some(ref last_href) = dehtml.last_href.take() {
259                let buf = dehtml.get_buf();
260                if buf.ends_with('[') {
261                    buf.truncate(buf.len() - 1);
262                } else {
263                    *buf += "](";
264                    *buf += last_href;
265                    *buf += ")";
266                }
267            }
268        }
269        "b" | "strong" => {
270            if dehtml.get_add_text() != AddText::No {
271                *dehtml.get_buf() += "*";
272            }
273        }
274        "i" | "em" => {
275            if dehtml.get_add_text() != AddText::No {
276                *dehtml.get_buf() += "_";
277            }
278        }
279        "blockquote" => pop_tag(&mut dehtml.blockquotes_since_blockquote),
280        _ => {}
281    }
282}
283
284#[expect(clippy::arithmetic_side_effects)]
285fn dehtml_starttag_cb<B: std::io::BufRead>(
286    event: &BytesStart,
287    dehtml: &mut Dehtml,
288    reader: &quick_xml::Reader<B>,
289) {
290    let tag = String::from_utf8_lossy(event.name().as_ref())
291        .trim()
292        .to_lowercase();
293
294    match tag.as_str() {
295        "p" | "table" | "td" => {
296            if !dehtml.strbuilder.is_empty() {
297                *dehtml.get_buf() += "\n\n";
298            }
299            dehtml.add_text = AddText::YesRemoveLineEnds;
300        }
301        #[rustfmt::skip]
302        "div" => {
303            maybe_push_tag(event, reader, "quote", &mut dehtml.divs_since_quote_div);
304            maybe_push_tag(event, reader, "quoted-content", &mut dehtml.divs_since_quoted_content_div);
305            maybe_push_tag(event, reader, "header-protection-legacy-display",
306                &mut dehtml.divs_since_hp_legacy_display);
307
308            *dehtml.get_buf() += "\n\n";
309            dehtml.add_text = AddText::YesRemoveLineEnds;
310        }
311        "br" => {
312            *dehtml.get_buf() += "\n";
313            dehtml.add_text = AddText::YesRemoveLineEnds;
314        }
315        "style" | "script" | "title" => {
316            dehtml.add_text = AddText::No;
317        }
318        "pre" => {
319            *dehtml.get_buf() += "\n\n";
320            dehtml.add_text = AddText::YesPreserveLineEnds;
321        }
322        "a" => {
323            if let Some(href) = event
324                .html_attributes()
325                .filter_map(|attr| attr.ok())
326                .find(|attr| {
327                    String::from_utf8_lossy(attr.key.as_ref())
328                        .trim()
329                        .to_lowercase()
330                        == "href"
331                })
332            {
333                let href = href
334                    .decode_and_unescape_value(reader.decoder())
335                    .unwrap_or_default()
336                    .to_string();
337
338                if !href.is_empty() {
339                    dehtml.last_href = Some(href);
340                    *dehtml.get_buf() += "[";
341                }
342            }
343        }
344        "b" | "strong" => {
345            if dehtml.get_add_text() != AddText::No {
346                *dehtml.get_buf() += "*";
347            }
348        }
349        "i" | "em" => {
350            if dehtml.get_add_text() != AddText::No {
351                *dehtml.get_buf() += "_";
352            }
353        }
354        "blockquote" => dehtml.blockquotes_since_blockquote += 1,
355        _ => {}
356    }
357}
358
/// Records a closing tag on a nesting counter stored in the `Dehtml` struct.
///
/// The counter is decremented by one but never goes below `0`, so a stray closing tag
/// outside of the tracked element leaves it untouched.
#[expect(clippy::arithmetic_side_effects)]
fn pop_tag(count: &mut u32) {
    if *count == 0 {
        return;
    }
    *count -= 1;
}
367
368/// In order to know when a specific tag is closed, we need to count the opening and closing tags.
369/// The `counts`s are stored in the `Dehtml` struct.
370#[expect(clippy::arithmetic_side_effects)]
371fn maybe_push_tag(
372    event: &BytesStart,
373    reader: &Reader<impl BufRead>,
374    tag_name: &str,
375    count: &mut u32,
376) {
377    if *count > 0 || tag_contains_attr(event, reader, tag_name) {
378        *count += 1;
379    }
380}
381
382fn tag_contains_attr(event: &BytesStart, reader: &Reader<impl BufRead>, name: &str) -> bool {
383    event.attributes().any(|r| {
384        r.map(|a| {
385            a.decode_and_unescape_value(reader.decoder())
386                .map(|v| v == name)
387                .unwrap_or(false)
388        })
389        .unwrap_or(false)
390    })
391}
392
/// Strips everything between `<` and `>` (inclusive) from `buf`.
///
/// This is a parser-free fallback: a `<` hides all following characters until the
/// next `>`, and the angle brackets themselves are never part of the output.
pub fn dehtml_manually(buf: &str) -> String {
    let mut inside_tag = false;
    buf.chars()
        .filter(|&c| match c {
            '<' => {
                inside_tag = true;
                false
            }
            '>' => {
                inside_tag = false;
                false
            }
            _ => !inside_tag,
        })
        .collect()
}
410
#[cfg(test)]
mod tests {
    use super::*;

    // Table-driven checks for links, bold/italic markers, entities and invalid HTML.
    // Inputs without any visible text must yield `None`.
    #[test]
    fn test_dehtml() {
        let cases = vec![
            (
                "<a href='https://example.com'> Foo </a>",
                "[ Foo ](https://example.com)",
            ),
            ("<b> bar </b>", "* bar *"),
            ("<i>foo</i>", "_foo_"),
            ("<b> bar <i> foo", "* bar _ foo"),
            ("&amp; bar", "& bar"),
            // Despite missing ', this should be shown:
            ("<a href='/foo.png>Hi</a> ", "Hi"),
            ("No link: <a href='https://get.delta.chat/'/>", "No link:"),
            (
                "No link: <a href='https://get.delta.chat/'></a>",
                "No link:",
            ),
            ("<!doctype html>\n<b>fat text</b>", "*fat text*"),
            // Invalid html (at least DC should show the text if the html is invalid):
            ("<!some invalid html code>\n<b>some text</b>", "some text"),
        ];
        for (input, output) in cases {
            assert_eq!(dehtml(input).unwrap().text, output);
        }
        let none_cases = vec!["<html> </html>", ""];
        for input in none_cases {
            assert_eq!(dehtml(input), None);
        }
    }

    // `<br>` produces a line break; whitespace and `\r` around it are cleaned up.
    #[test]
    fn test_dehtml_parse_br() {
        let html = "line1<br>line2";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "line1\nline2");

        let html = "line1<br> line2";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "line1\nline2");

        let html = "line1  <br><br> line2";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "line1\n\nline2");

        let html = "\r\r\nline1<br>\r\n\r\n\r\rline2<br/>line3\n\r";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "line1\nline2\nline3");
    }

    // Inline tags add no whitespace of their own; whitespace and line ends
    // around them collapse into a single space.
    #[test]
    fn test_dehtml_parse_span() {
        assert_eq!(dehtml("<span>Foo</span>bar").unwrap().text, "Foobar");
        assert_eq!(dehtml("<span>Foo</span> bar").unwrap().text, "Foo bar");
        assert_eq!(dehtml("<span>Foo </span>bar").unwrap().text, "Foo bar");
        assert_eq!(dehtml("<span>Foo</span>\nbar").unwrap().text, "Foo bar");
        assert_eq!(dehtml("\n<span>Foo</span> bar").unwrap().text, "Foo bar");
        assert_eq!(dehtml("<span>Foo</span>\n\nbar").unwrap().text, "Foo bar");
        assert_eq!(dehtml("Foo\n<span>bar</span>").unwrap().text, "Foo bar");
        assert_eq!(dehtml("Foo<span>\nbar</span>").unwrap().text, "Foo bar");
    }

    // Paragraphs are separated by an empty line, even when `</p>` is missing.
    #[test]
    fn test_dehtml_parse_p() {
        let html = "<p>Foo</p><p>Bar</p>";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "Foo\n\nBar");

        let html = "<p>Foo<p>Bar";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "Foo\n\nBar");

        let html = "<p>Foo</p><p>Bar<p>Baz";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "Foo\n\nBar\n\nBaz");
    }

    // An unquoted `href` value is accepted.
    #[test]
    fn test_dehtml_parse_href() {
        let html = "<a href=url>text</a>";
        let plain = dehtml(html).unwrap().text;

        assert_eq!(plain, "[text](url)");
    }

    // Tag and attribute names match case-insensitively, but the URL keeps its case.
    #[test]
    fn test_dehtml_case_sensitive_link() {
        let html = "<html><A HrEf=\"https://foo.bar/Data\">case in URLs matter</A></html>";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "[case in URLs matter](https://foo.bar/Data)");
    }

    // Doctype, comments and processing instructions are dropped; CDATA content is kept.
    #[test]
    fn test_dehtml_bold_text() {
        let html = "<!DOCTYPE name [<!DOCTYPE ...>]><!-- comment -->text <b><?php echo ... ?>bold</b><![CDATA[<>]]>";
        let plain = dehtml(html).unwrap().text;

        assert_eq!(plain, "text *bold*<>");
    }

    // Known HTML entities are resolved; the unknown `&noent;` is passed through as written.
    #[test]
    fn test_dehtml_html_encoded() {
        let html = "&lt;&gt;&quot;&apos;&amp; &auml;&Auml;&ouml;&Ouml;&uuml;&Uuml;&szlig; foo&AElig;&ccedil;&Ccedil; &diams;&lrm;&rlm;&zwnj;&noent;&zwj;";

        let plain = dehtml(html).unwrap().text;

        assert_eq!(
            plain,
            "<>\"\'& äÄöÖüÜß fooÆçÇ \u{2666}\u{200e}\u{200f}\u{200c}&noent;\u{200d}"
        );
    }

    // A document with an unclosed `<meta>` tag still yields the body text,
    // and the `<title>` content is not shown.
    #[test]
    fn test_unclosed_tags() {
        let input = r##"
        <!DOCTYPE HTML PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN'
        'http://www.w3.org/TR/html4/loose.dtd'>
        <html>
        <head>
        <title>Hi</title>
        <meta http-equiv='Content-Type' content='text/html; charset=iso-8859-1'>						
        </head>
        <body>
        lots of text
        </body>
        </html>
        "##;
        let txt = dehtml(input).unwrap();
        assert_eq!(txt.text.trim(), "lots of text");
    }

    // Line ends inside `<pre>` are preserved.
    #[test]
    fn test_pre_tag() {
        let input = "<html><pre>\ntwo\nlines\n</pre></html>";
        let txt = dehtml(input).unwrap();
        assert_eq!(txt.text.trim(), "two\nlines");
    }

    // Content of `<div class="header-protection-legacy-display">` is omitted.
    #[test]
    fn test_hp_legacy_display() {
        let input = r#"
<html><head><title></title></head><body>
<div class="header-protection-legacy-display">
<pre>Subject: Dinner plans</pre>
</div>
<p>
Let's meet at Rama's Roti Shop at 8pm and go to the park
from there.
</p>
</body>
</html>
        "#;
        let txt = dehtml(input).unwrap();
        assert_eq!(
            txt.text.trim(),
            "Let's meet at Rama's Roti Shop at 8pm and go to the park from there."
        );
    }

    // A GMX-style quote ends up in `top_quote`, separate from the message text.
    // NOTE(review): nothing in this test awaits, so a plain `#[test]` would
    // presumably suffice -- confirm before changing.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn test_quote_div() {
        let input = include_str!("../test-data/message/gmx-quote-body.eml");
        let dehtml = dehtml(input).unwrap();
        let SimplifiedText {
            text,
            is_forwarded,
            is_cut,
            top_quote,
            footer,
        } = dehtml;
        assert_eq!(text, "Test");
        assert_eq!(is_forwarded, false);
        assert_eq!(is_cut, false);
        assert_eq!(top_quote.as_deref(), Some("test"));
        assert_eq!(footer, None);
    }

    // Whitespace handling on a complete HTML email taken from the test data.
    #[test]
    fn test_spaces() {
        let input = include_str!("../test-data/spaces.html");
        let txt = dehtml(input).unwrap();
        assert_eq!(
            txt.text,
            "Welcome back to Strolling!\n\nHey there,\n\nWelcome back! Use this link to securely sign in to your Strolling account:\n\nSign in to Strolling\n\nFor your security, the link will expire in 24 hours time.\n\nSee you soon!\n\nYou can also copy & paste this URL into your browser:\n\nhttps://strolling.rosano.ca/members/?token=XXX&action=signin&r=https%3A%2F%2Fstrolling.rosano.ca%2F\n\nIf you did not make this request, you can safely ignore this email.\n\nThis message was sent from [strolling.rosano.ca](https://strolling.rosano.ca/) to [alice@example.org](mailto:alice@example.org)"
        );
    }
}