deltachat/
dehtml.rs

1//! De-HTML.
2//!
3//! A module to remove HTML tags from the email text
4
5use std::io::BufRead;
6use std::sync::LazyLock;
7
8use quick_xml::{
9    events::{BytesEnd, BytesStart, BytesText},
10    Reader,
11};
12
13use crate::simplify::{simplify_quote, SimplifiedText};
14
15struct Dehtml {
16    strbuilder: String,
17    quote: String,
18    add_text: AddText,
19    last_href: Option<String>,
20    /// GMX wraps a quote in `<div name="quote">`. After a `<div name="quote">`, this count is
21    /// increased at each `<div>` and decreased at each `</div>`. This way we know when the quote ends.
22    /// If this is > `0`, then we are inside a `<div name="quote">`
23    divs_since_quote_div: u32,
24    /// Everything between `<div name="quote">` and `<div name="quoted-content">` is usually metadata
25    /// If this is > `0`, then we are inside a `<div name="quoted-content">`.
26    divs_since_quoted_content_div: u32,
27    /// All-Inkl just puts the quote into `<blockquote> </blockquote>`. This count is
28    /// increased at each `<blockquote>` and decreased at each `</blockquote>`.
29    blockquotes_since_blockquote: u32,
30}
31
32impl Dehtml {
33    /// Returns true if HTML parser is currently inside the quote.
34    fn is_quote(&self) -> bool {
35        self.divs_since_quoted_content_div > 0 || self.blockquotes_since_blockquote > 0
36    }
37
38    /// Returns the buffer where the text should be written.
39    ///
40    /// If the parser is inside the quote, returns the quote buffer.
41    fn get_buf(&mut self) -> &mut String {
42        if self.is_quote() {
43            &mut self.quote
44        } else {
45            &mut self.strbuilder
46        }
47    }
48
49    fn get_add_text(&self) -> AddText {
50        if self.divs_since_quote_div > 0 && self.divs_since_quoted_content_div == 0 {
51            AddText::No // Everything between `<div name="quoted">` and `<div name="quoted_content">` is metadata which we don't want
52        } else {
53            self.add_text
54        }
55    }
56}
57
/// Decides what happens to text nodes found by the HTML parser.
#[derive(Clone, Copy, Debug, PartialEq)]
enum AddText {
    /// Text is dropped.
    ///
    /// Used inside `<script>`, `<style>` and similar tags
    /// whose contents should not be displayed.
    No,

    /// Text is kept, with every run of line ends replaced by a single space.
    YesRemoveLineEnds,

    /// Text is kept, with every run of line ends replaced by a single `\n`.
    ///
    /// Used inside `<pre>`.
    YesPreserveLineEnds,
}
69
70pub(crate) fn dehtml(buf: &str) -> Option<SimplifiedText> {
71    let (s, quote) = dehtml_quick_xml(buf);
72    if !s.trim().is_empty() {
73        let text = dehtml_cleanup(s);
74        let top_quote = if !quote.trim().is_empty() {
75            Some(dehtml_cleanup(simplify_quote(&quote).0))
76        } else {
77            None
78        };
79        return Some(SimplifiedText {
80            text,
81            top_quote,
82            ..Default::default()
83        });
84    }
85    let s = dehtml_manually(buf);
86    if !s.trim().is_empty() {
87        let text = dehtml_cleanup(s);
88        return Some(SimplifiedText {
89            text,
90            ..Default::default()
91        });
92    }
93    None
94}
95
/// Normalizes whitespace of the converted text.
///
/// All `\r` characters are removed, surrounding whitespace of the whole text is trimmed,
/// trailing whitespace is removed from every line and any run of whitespace-only lines
/// is reduced to a single empty line. Leading whitespace of a line is kept.
fn dehtml_cleanup(mut text: String) -> String {
    text.retain(|c| c != '\r');

    let mut cleaned = String::new();
    // Set when at least one blank line was skipped since the last kept line.
    let mut pending_blank = false;
    for line in text.trim().split('\n') {
        if line.trim().is_empty() {
            pending_blank = true;
            continue;
        }
        if !cleaned.is_empty() {
            cleaned.push_str(if pending_blank { "\n\n" } else { "\n" });
        }
        cleaned.push_str(line.trim_end());
        pending_blank = false;
    }
    cleaned
}
117
118fn dehtml_quick_xml(buf: &str) -> (String, String) {
119    let buf = buf.trim().trim_start_matches("<!doctype html>");
120
121    let mut dehtml = Dehtml {
122        strbuilder: String::with_capacity(buf.len()),
123        quote: String::new(),
124        add_text: AddText::YesRemoveLineEnds,
125        last_href: None,
126        divs_since_quote_div: 0,
127        divs_since_quoted_content_div: 0,
128        blockquotes_since_blockquote: 0,
129    };
130
131    let mut reader = quick_xml::Reader::from_str(buf);
132    reader.config_mut().check_end_names = false;
133
134    let mut buf = Vec::new();
135
136    loop {
137        match reader.read_event_into(&mut buf) {
138            Ok(quick_xml::events::Event::Start(ref e)) => {
139                dehtml_starttag_cb(e, &mut dehtml, &reader)
140            }
141            Ok(quick_xml::events::Event::End(ref e)) => dehtml_endtag_cb(e, &mut dehtml),
142            Ok(quick_xml::events::Event::Text(ref e)) => dehtml_text_cb(e, &mut dehtml),
143            Ok(quick_xml::events::Event::CData(e)) => match e.escape() {
144                Ok(e) => dehtml_text_cb(&e, &mut dehtml),
145                Err(e) => {
146                    eprintln!(
147                        "CDATA escape error at position {}: {:?}",
148                        reader.buffer_position(),
149                        e,
150                    );
151                }
152            },
153            Ok(quick_xml::events::Event::Empty(ref e)) => {
154                // Handle empty tags as a start tag immediately followed by end tag.
155                // For example, `<p/>` is treated as `<p></p>`.
156                dehtml_starttag_cb(e, &mut dehtml, &reader);
157                dehtml_endtag_cb(
158                    &BytesEnd::new(String::from_utf8_lossy(e.name().as_ref())),
159                    &mut dehtml,
160                );
161            }
162            Err(e) => {
163                eprintln!(
164                    "Parse html error: Error at position {}: {:?}",
165                    reader.buffer_position(),
166                    e
167                );
168            }
169            Ok(quick_xml::events::Event::Eof) => break,
170            _ => (),
171        }
172        buf.clear();
173    }
174
175    (dehtml.strbuilder, dehtml.quote)
176}
177
178fn dehtml_text_cb(event: &BytesText, dehtml: &mut Dehtml) {
179    static LINE_RE: LazyLock<regex::Regex> =
180        LazyLock::new(|| regex::Regex::new(r"(\r?\n)+").unwrap());
181
182    if dehtml.get_add_text() == AddText::YesPreserveLineEnds
183        || dehtml.get_add_text() == AddText::YesRemoveLineEnds
184    {
185        let event = event as &[_];
186        let event_str = std::str::from_utf8(event).unwrap_or_default();
187        let mut last_added = escaper::decode_html_buf_sloppy(event).unwrap_or_default();
188        if event_str.starts_with(&last_added) {
189            last_added = event_str.to_string();
190        }
191
192        if dehtml.get_add_text() == AddText::YesRemoveLineEnds {
193            // Replace all line ends with spaces.
194            // E.g. `\r\n\r\n` is replaced with one space.
195            let last_added = LINE_RE.replace_all(&last_added, " ");
196
197            // Add a space if `last_added` starts with a space
198            // and there is no whitespace at the end of the buffer yet.
199            // Trim the rest of leading whitespace from `last_added`.
200            let buf = dehtml.get_buf();
201            if !buf.ends_with(' ') && !buf.ends_with('\n') && last_added.starts_with(' ') {
202                *buf += " ";
203            }
204
205            *buf += last_added.trim_start();
206        } else {
207            *dehtml.get_buf() += LINE_RE.replace_all(&last_added, "\n").as_ref();
208        }
209    }
210}
211
212fn dehtml_endtag_cb(event: &BytesEnd, dehtml: &mut Dehtml) {
213    let tag = String::from_utf8_lossy(event.name().as_ref())
214        .trim()
215        .to_lowercase();
216
217    match tag.as_str() {
218        "style" | "script" | "title" | "pre" => {
219            *dehtml.get_buf() += "\n\n";
220            dehtml.add_text = AddText::YesRemoveLineEnds;
221        }
222        "div" => {
223            pop_tag(&mut dehtml.divs_since_quote_div);
224            pop_tag(&mut dehtml.divs_since_quoted_content_div);
225
226            *dehtml.get_buf() += "\n\n";
227            dehtml.add_text = AddText::YesRemoveLineEnds;
228        }
229        "a" => {
230            if let Some(ref last_href) = dehtml.last_href.take() {
231                let buf = dehtml.get_buf();
232                if buf.ends_with('[') {
233                    buf.truncate(buf.len() - 1);
234                } else {
235                    *buf += "](";
236                    *buf += last_href;
237                    *buf += ")";
238                }
239            }
240        }
241        "b" | "strong" => {
242            if dehtml.get_add_text() != AddText::No {
243                *dehtml.get_buf() += "*";
244            }
245        }
246        "i" | "em" => {
247            if dehtml.get_add_text() != AddText::No {
248                *dehtml.get_buf() += "_";
249            }
250        }
251        "blockquote" => pop_tag(&mut dehtml.blockquotes_since_blockquote),
252        _ => {}
253    }
254}
255
256fn dehtml_starttag_cb<B: std::io::BufRead>(
257    event: &BytesStart,
258    dehtml: &mut Dehtml,
259    reader: &quick_xml::Reader<B>,
260) {
261    let tag = String::from_utf8_lossy(event.name().as_ref())
262        .trim()
263        .to_lowercase();
264
265    match tag.as_str() {
266        "p" | "table" | "td" => {
267            if !dehtml.strbuilder.is_empty() {
268                *dehtml.get_buf() += "\n\n";
269            }
270            dehtml.add_text = AddText::YesRemoveLineEnds;
271        }
272        #[rustfmt::skip]
273        "div" => {
274            maybe_push_tag(event, reader, "quote", &mut dehtml.divs_since_quote_div);
275            maybe_push_tag(event, reader, "quoted-content", &mut dehtml.divs_since_quoted_content_div);
276
277            *dehtml.get_buf() += "\n\n";
278            dehtml.add_text = AddText::YesRemoveLineEnds;
279        }
280        "br" => {
281            *dehtml.get_buf() += "\n";
282            dehtml.add_text = AddText::YesRemoveLineEnds;
283        }
284        "style" | "script" | "title" => {
285            dehtml.add_text = AddText::No;
286        }
287        "pre" => {
288            *dehtml.get_buf() += "\n\n";
289            dehtml.add_text = AddText::YesPreserveLineEnds;
290        }
291        "a" => {
292            if let Some(href) = event
293                .html_attributes()
294                .filter_map(|attr| attr.ok())
295                .find(|attr| {
296                    String::from_utf8_lossy(attr.key.as_ref())
297                        .trim()
298                        .to_lowercase()
299                        == "href"
300                })
301            {
302                let href = href
303                    .decode_and_unescape_value(reader.decoder())
304                    .unwrap_or_default()
305                    .to_string();
306
307                if !href.is_empty() {
308                    dehtml.last_href = Some(href);
309                    *dehtml.get_buf() += "[";
310                }
311            }
312        }
313        "b" | "strong" => {
314            if dehtml.get_add_text() != AddText::No {
315                *dehtml.get_buf() += "*";
316            }
317        }
318        "i" | "em" => {
319            if dehtml.get_add_text() != AddText::No {
320                *dehtml.get_buf() += "_";
321            }
322        }
323        "blockquote" => dehtml.blockquotes_since_blockquote += 1,
324        _ => {}
325    }
326}
327
/// Registers a closing tag in one of the depth counters of the `Dehtml` struct.
///
/// Opening and closing tags are counted in order to know when a specific tag is closed.
/// The counter is decreased by one and never goes below `0`.
fn pop_tag(count: &mut u32) {
    *count = count.saturating_sub(1);
}
335
336/// In order to know when a specific tag is closed, we need to count the opening and closing tags.
337/// The `counts`s are stored in the `Dehtml` struct.
338fn maybe_push_tag(
339    event: &BytesStart,
340    reader: &Reader<impl BufRead>,
341    tag_name: &str,
342    count: &mut u32,
343) {
344    if *count > 0 || tag_contains_attr(event, reader, tag_name) {
345        *count += 1;
346    }
347}
348
349fn tag_contains_attr(event: &BytesStart, reader: &Reader<impl BufRead>, name: &str) -> bool {
350    event.attributes().any(|r| {
351        r.map(|a| {
352            a.decode_and_unescape_value(reader.decoder())
353                .map(|v| v == name)
354                .unwrap_or(false)
355        })
356        .unwrap_or(false)
357    })
358}
359
/// Strips tags without parsing: drops everything between `<` and `>`.
///
/// The `<` and `>` characters themselves are always dropped, even a `>`
/// that has no preceding `<`. Text after an unclosed `<` is dropped entirely.
pub fn dehtml_manually(buf: &str) -> String {
    let mut inside_tag = false;
    buf.chars()
        .filter(|&ch| match ch {
            '<' => {
                inside_tag = true;
                false
            }
            '>' => {
                inside_tag = false;
                false
            }
            _ => !inside_tag,
        })
        .collect()
}
377
#[cfg(test)]
mod tests {
    use super::*;

    /// Table of inputs and the text expected from `dehtml()`,
    /// followed by inputs for which no text at all is expected.
    #[test]
    fn test_dehtml() {
        let cases = [
            (
                "<a href='https://example.com'> Foo </a>",
                "[ Foo ](https://example.com)",
            ),
            ("<b> bar </b>", "* bar *"),
            ("<i>foo</i>", "_foo_"),
            ("<b> bar <i> foo", "* bar _ foo"),
            ("&amp; bar", "& bar"),
            // Despite missing ', this should be shown:
            ("<a href='/foo.png>Hi</a> ", "Hi"),
            ("No link: <a href='https://get.delta.chat/'/>", "No link:"),
            (
                "No link: <a href='https://get.delta.chat/'></a>",
                "No link:",
            ),
            ("<!doctype html>\n<b>fat text</b>", "*fat text*"),
            // Invalid html (at least DC should show the text if the html is invalid):
            ("<!some invalid html code>\n<b>some text</b>", "some text"),
        ];
        for (input, output) in cases {
            assert_eq!(dehtml(input).unwrap().text, output, "input: {input:?}");
        }
        let none_cases = ["<html> </html>", ""];
        for input in none_cases {
            assert_eq!(dehtml(input), None, "input: {input:?}");
        }
    }

    /// `<br>` becomes a line break; line ends of the HTML source itself do not.
    #[test]
    fn test_dehtml_parse_br() {
        let html = "line1<br>line2";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "line1\nline2");

        let html = "line1<br> line2";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "line1\nline2");

        let html = "line1  <br><br> line2";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "line1\n\nline2");

        let html = "\r\r\nline1<br>\r\n\r\n\r\rline2<br/>line3\n\r";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "line1\nline2\nline3");
    }

    /// Whitespace and line ends around inline elements collapse into a single space.
    #[test]
    fn test_dehtml_parse_span() {
        let cases = [
            ("<span>Foo</span>bar", "Foobar"),
            ("<span>Foo</span> bar", "Foo bar"),
            ("<span>Foo </span>bar", "Foo bar"),
            ("<span>Foo</span>\nbar", "Foo bar"),
            ("\n<span>Foo</span> bar", "Foo bar"),
            ("<span>Foo</span>\n\nbar", "Foo bar"),
            ("Foo\n<span>bar</span>", "Foo bar"),
            ("Foo<span>\nbar</span>", "Foo bar"),
        ];
        for (input, output) in cases {
            assert_eq!(dehtml(input).unwrap().text, output, "input: {input:?}");
        }
    }

    /// Paragraphs are separated by an empty line, with or without closing tags.
    #[test]
    fn test_dehtml_parse_p() {
        let html = "<p>Foo</p><p>Bar</p>";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "Foo\n\nBar");

        let html = "<p>Foo<p>Bar";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "Foo\n\nBar");

        let html = "<p>Foo</p><p>Bar<p>Baz";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "Foo\n\nBar\n\nBaz");
    }

    /// An unquoted `href` value is accepted.
    #[test]
    fn test_dehtml_parse_href() {
        let html = "<a href=url>text</a>";
        let plain = dehtml(html).unwrap().text;

        assert_eq!(plain, "[text](url)");
    }

    /// Tag and attribute names are case-insensitive, the URL keeps its case.
    #[test]
    fn test_dehtml_case_sensitive_link() {
        let html = "<html><A HrEf=\"https://foo.bar/Data\">case in URLs matter</A></html>";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "[case in URLs matter](https://foo.bar/Data)");
    }

    /// Doctype, comments and processing instructions are dropped, CDATA is kept as text.
    #[test]
    fn test_dehtml_bold_text() {
        let html = "<!DOCTYPE name [<!DOCTYPE ...>]><!-- comment -->text <b><?php echo ... ?>bold</b><![CDATA[<>]]>";
        let plain = dehtml(html).unwrap().text;

        assert_eq!(plain, "text *bold*<>");
    }

    /// Known HTML entities are decoded, unknown ones (`&noent;`) are left as they are.
    #[test]
    fn test_dehtml_html_encoded() {
        let html =
                "&lt;&gt;&quot;&apos;&amp; &auml;&Auml;&ouml;&Ouml;&uuml;&Uuml;&szlig; foo&AElig;&ccedil;&Ccedil; &diams;&lrm;&rlm;&zwnj;&noent;&zwj;";

        let plain = dehtml(html).unwrap().text;

        assert_eq!(
            plain,
            "<>\"\'& äÄöÖüÜß fooÆçÇ \u{2666}\u{200e}\u{200f}\u{200c}&noent;\u{200d}"
        );
    }

    /// A document with a never-closed `<meta>` tag still yields its body text.
    #[test]
    fn test_unclosed_tags() {
        let input = r##"
        <!DOCTYPE HTML PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN'
        'http://www.w3.org/TR/html4/loose.dtd'>
        <html>
        <head>
        <title>Hi</title>
        <meta http-equiv='Content-Type' content='text/html; charset=iso-8859-1'>						
        </head>
        <body>
        lots of text
        </body>
        </html>
        "##;
        let txt = dehtml(input).unwrap();
        assert_eq!(txt.text.trim(), "lots of text");
    }

    /// Line ends inside `<pre>` are preserved.
    #[test]
    fn test_pre_tag() {
        let input = "<html><pre>\ntwo\nlines\n</pre></html>";
        let txt = dehtml(input).unwrap();
        assert_eq!(txt.text.trim(), "two\nlines");
    }

    /// A GMX-style `<div name="quote">` ends up in `top_quote`, not in the text.
    ///
    /// Nothing in here is awaited, so this is a plain synchronous test.
    #[test]
    fn test_quote_div() {
        let input = include_str!("../test-data/message/gmx-quote-body.eml");
        // Destructure exhaustively so that a new field forces this test to be revisited.
        let SimplifiedText {
            text,
            is_forwarded,
            is_cut,
            top_quote,
            footer,
        } = dehtml(input).unwrap();
        assert_eq!(text, "Test");
        assert!(!is_forwarded);
        assert!(!is_cut);
        assert_eq!(top_quote.as_deref(), Some("test"));
        assert_eq!(footer, None);
    }

    /// Whitespace-heavy, table-based HTML mail is reduced to clean paragraphs.
    #[test]
    fn test_spaces() {
        let input = include_str!("../test-data/spaces.html");
        let txt = dehtml(input).unwrap();
        assert_eq!(txt.text, "Welcome back to Strolling!\n\nHey there,\n\nWelcome back! Use this link to securely sign in to your Strolling account:\n\nSign in to Strolling\n\nFor your security, the link will expire in 24 hours time.\n\nSee you soon!\n\nYou can also copy & paste this URL into your browser:\n\nhttps://strolling.rosano.ca/members/?token=XXX&action=signin&r=https%3A%2F%2Fstrolling.rosano.ca%2F\n\nIf you did not make this request, you can safely ignore this email.\n\nThis message was sent from [strolling.rosano.ca](https://strolling.rosano.ca/) to [alice@example.org](mailto:alice@example.org)");
    }
}