deltachat/
dehtml.rs

1//! De-HTML.
2//!
3//! A module to remove HTML tags from the email text
4
5use std::io::BufRead;
6use std::sync::LazyLock;
7
8use quick_xml::{
9    Reader,
10    errors::Error as QuickXmlError,
11    events::{BytesEnd, BytesStart, BytesText},
12};
13
14use crate::simplify::{SimplifiedText, simplify_quote};
15
16struct Dehtml {
17    strbuilder: String,
18    quote: String,
19    add_text: AddText,
20    last_href: Option<String>,
21    /// GMX wraps a quote in `<div name="quote">`. After a `<div name="quote">`, this count is
22    /// increased at each `<div>` and decreased at each `</div>`. This way we know when the quote ends.
23    /// If this is > `0`, then we are inside a `<div name="quote">`
24    divs_since_quote_div: u32,
25    /// Everything between `<div name="quote">` and `<div name="quoted-content">` is usually metadata
26    /// If this is > `0`, then we are inside a `<div name="quoted-content">`.
27    divs_since_quoted_content_div: u32,
28    /// All-Inkl just puts the quote into `<blockquote> </blockquote>`. This count is
29    /// increased at each `<blockquote>` and decreased at each `</blockquote>`.
30    blockquotes_since_blockquote: u32,
31}
32
33impl Dehtml {
34    /// Returns true if HTML parser is currently inside the quote.
35    fn is_quote(&self) -> bool {
36        self.divs_since_quoted_content_div > 0 || self.blockquotes_since_blockquote > 0
37    }
38
39    /// Returns the buffer where the text should be written.
40    ///
41    /// If the parser is inside the quote, returns the quote buffer.
42    fn get_buf(&mut self) -> &mut String {
43        if self.is_quote() {
44            &mut self.quote
45        } else {
46            &mut self.strbuilder
47        }
48    }
49
50    fn get_add_text(&self) -> AddText {
51        if self.divs_since_quote_div > 0 && self.divs_since_quoted_content_div == 0 {
52            AddText::No // Everything between `<div name="quoted">` and `<div name="quoted_content">` is metadata which we don't want
53        } else {
54            self.add_text
55        }
56    }
57}
58
/// Tells the text callbacks whether, and how, text is appended to the output.
#[derive(Debug, Clone, Copy, PartialEq)]
enum AddText {
    /// Inside `<script>`, `<style>` and similar tags
    /// whose contents should not be displayed.
    No,

    /// Regular text: runs of line ends are replaced with a single space.
    YesRemoveLineEnds,

    /// Inside `<pre>`: line ends are kept.
    YesPreserveLineEnds,
}
70
71pub(crate) fn dehtml(buf: &str) -> Option<SimplifiedText> {
72    let (s, quote) = dehtml_quick_xml(buf);
73    if !s.trim().is_empty() {
74        let text = dehtml_cleanup(s);
75        let top_quote = if !quote.trim().is_empty() {
76            Some(dehtml_cleanup(simplify_quote(&quote).0))
77        } else {
78            None
79        };
80        return Some(SimplifiedText {
81            text,
82            top_quote,
83            ..Default::default()
84        });
85    }
86    let s = dehtml_manually(buf);
87    if !s.trim().is_empty() {
88        let text = dehtml_cleanup(s);
89        return Some(SimplifiedText {
90            text,
91            ..Default::default()
92        });
93    }
94    None
95}
96
/// Normalizes whitespace of converted text.
///
/// Removes all `\r`, trims the text as a whole, strips trailing whitespace from every
/// line and collapses each run of whitespace-only lines into a single empty line.
fn dehtml_cleanup(mut text: String) -> String {
    text.retain(|c| c != '\r');

    let mut cleaned = String::new();
    // Set once a whitespace-only line was seen since the last line that was written.
    let mut blank_pending = false;
    for line in text.trim().split('\n') {
        if line.trim().is_empty() {
            blank_pending = true;
            continue;
        }
        if !cleaned.is_empty() {
            cleaned.push_str(if blank_pending { "\n\n" } else { "\n" });
        }
        cleaned.push_str(line.trim_end());
        blank_pending = false;
    }
    cleaned
}
118
119fn dehtml_quick_xml(buf: &str) -> (String, String) {
120    let buf = buf.trim().trim_start_matches("<!doctype html>");
121
122    let mut dehtml = Dehtml {
123        strbuilder: String::with_capacity(buf.len()),
124        quote: String::new(),
125        add_text: AddText::YesRemoveLineEnds,
126        last_href: None,
127        divs_since_quote_div: 0,
128        divs_since_quoted_content_div: 0,
129        blockquotes_since_blockquote: 0,
130    };
131
132    let mut reader = quick_xml::Reader::from_str(buf);
133    reader.config_mut().check_end_names = false;
134
135    let mut buf = Vec::new();
136    let mut char_buf = String::with_capacity(4);
137
138    loop {
139        match reader.read_event_into(&mut buf) {
140            Ok(quick_xml::events::Event::Start(ref e)) => {
141                dehtml_starttag_cb(e, &mut dehtml, &reader)
142            }
143            Ok(quick_xml::events::Event::End(ref e)) => dehtml_endtag_cb(e, &mut dehtml),
144            Ok(quick_xml::events::Event::Text(ref e)) => dehtml_text_cb(e, &mut dehtml),
145            Ok(quick_xml::events::Event::CData(e)) => {
146                str_cb(&String::from_utf8_lossy(&e as &[_]), &mut dehtml)
147            }
148            Ok(quick_xml::events::Event::Empty(ref e)) => {
149                // Handle empty tags as a start tag immediately followed by end tag.
150                // For example, `<p/>` is treated as `<p></p>`.
151                dehtml_starttag_cb(e, &mut dehtml, &reader);
152                dehtml_endtag_cb(
153                    &BytesEnd::new(String::from_utf8_lossy(e.name().as_ref())),
154                    &mut dehtml,
155                );
156            }
157            Ok(quick_xml::events::Event::GeneralRef(ref e)) => {
158                match e.resolve_char_ref() {
159                    Err(err) => eprintln!(
160                        "resolve_char_ref() error at position {}: {:?}",
161                        reader.buffer_position(),
162                        err,
163                    ),
164                    Ok(Some(ch)) => {
165                        char_buf.clear();
166                        char_buf.push(ch);
167                        str_cb(&char_buf, &mut dehtml);
168                    }
169                    Ok(None) => {
170                        let event_str = String::from_utf8_lossy(e);
171                        if let Some(s) = quick_xml::escape::resolve_html5_entity(&event_str) {
172                            str_cb(s, &mut dehtml);
173                        } else {
174                            // Nonstandard entity. Add escaped.
175                            str_cb(&format!("&{event_str};"), &mut dehtml);
176                        }
177                    }
178                }
179            }
180            Err(QuickXmlError::IllFormed(_)) => {
181                // This is probably not HTML at all and should be left as is.
182                str_cb(&String::from_utf8_lossy(&buf), &mut dehtml);
183            }
184            Err(e) => {
185                eprintln!(
186                    "Parse html error: Error at position {}: {:?}",
187                    reader.buffer_position(),
188                    e
189                );
190            }
191            Ok(quick_xml::events::Event::Eof) => break,
192            _ => (),
193        }
194        buf.clear();
195    }
196
197    (dehtml.strbuilder, dehtml.quote)
198}
199
200fn dehtml_text_cb(event: &BytesText, dehtml: &mut Dehtml) {
201    if dehtml.get_add_text() == AddText::YesPreserveLineEnds
202        || dehtml.get_add_text() == AddText::YesRemoveLineEnds
203    {
204        let event = event as &[_];
205        let event_str = std::str::from_utf8(event).unwrap_or_default();
206        str_cb(event_str, dehtml);
207    }
208}
209
210fn str_cb(event_str: &str, dehtml: &mut Dehtml) {
211    static LINE_RE: LazyLock<regex::Regex> =
212        LazyLock::new(|| regex::Regex::new(r"(\r?\n)+").unwrap());
213
214    let add_text = dehtml.get_add_text();
215    if add_text == AddText::YesRemoveLineEnds {
216        // Replace all line ends with spaces.
217        // E.g. `\r\n\r\n` is replaced with one space.
218        let event_str = LINE_RE.replace_all(event_str, " ");
219
220        // Add a space if `event_str` starts with a space
221        // and there is no whitespace at the end of the buffer yet.
222        // Trim the rest of leading whitespace from `event_str`.
223        let buf = dehtml.get_buf();
224        if !buf.ends_with(' ') && !buf.ends_with('\n') && event_str.starts_with(' ') {
225            *buf += " ";
226        }
227
228        *buf += event_str.trim_start();
229    } else if add_text == AddText::YesPreserveLineEnds {
230        *dehtml.get_buf() += LINE_RE.replace_all(event_str, "\n").as_ref();
231    }
232}
233
234fn dehtml_endtag_cb(event: &BytesEnd, dehtml: &mut Dehtml) {
235    let tag = String::from_utf8_lossy(event.name().as_ref())
236        .trim()
237        .to_lowercase();
238
239    match tag.as_str() {
240        "style" | "script" | "title" | "pre" => {
241            *dehtml.get_buf() += "\n\n";
242            dehtml.add_text = AddText::YesRemoveLineEnds;
243        }
244        "div" => {
245            pop_tag(&mut dehtml.divs_since_quote_div);
246            pop_tag(&mut dehtml.divs_since_quoted_content_div);
247
248            *dehtml.get_buf() += "\n\n";
249            dehtml.add_text = AddText::YesRemoveLineEnds;
250        }
251        "a" => {
252            if let Some(ref last_href) = dehtml.last_href.take() {
253                let buf = dehtml.get_buf();
254                if buf.ends_with('[') {
255                    buf.truncate(buf.len() - 1);
256                } else {
257                    *buf += "](";
258                    *buf += last_href;
259                    *buf += ")";
260                }
261            }
262        }
263        "b" | "strong" => {
264            if dehtml.get_add_text() != AddText::No {
265                *dehtml.get_buf() += "*";
266            }
267        }
268        "i" | "em" => {
269            if dehtml.get_add_text() != AddText::No {
270                *dehtml.get_buf() += "_";
271            }
272        }
273        "blockquote" => pop_tag(&mut dehtml.blockquotes_since_blockquote),
274        _ => {}
275    }
276}
277
278fn dehtml_starttag_cb<B: std::io::BufRead>(
279    event: &BytesStart,
280    dehtml: &mut Dehtml,
281    reader: &quick_xml::Reader<B>,
282) {
283    let tag = String::from_utf8_lossy(event.name().as_ref())
284        .trim()
285        .to_lowercase();
286
287    match tag.as_str() {
288        "p" | "table" | "td" => {
289            if !dehtml.strbuilder.is_empty() {
290                *dehtml.get_buf() += "\n\n";
291            }
292            dehtml.add_text = AddText::YesRemoveLineEnds;
293        }
294        #[rustfmt::skip]
295        "div" => {
296            maybe_push_tag(event, reader, "quote", &mut dehtml.divs_since_quote_div);
297            maybe_push_tag(event, reader, "quoted-content", &mut dehtml.divs_since_quoted_content_div);
298
299            *dehtml.get_buf() += "\n\n";
300            dehtml.add_text = AddText::YesRemoveLineEnds;
301        }
302        "br" => {
303            *dehtml.get_buf() += "\n";
304            dehtml.add_text = AddText::YesRemoveLineEnds;
305        }
306        "style" | "script" | "title" => {
307            dehtml.add_text = AddText::No;
308        }
309        "pre" => {
310            *dehtml.get_buf() += "\n\n";
311            dehtml.add_text = AddText::YesPreserveLineEnds;
312        }
313        "a" => {
314            if let Some(href) = event
315                .html_attributes()
316                .filter_map(|attr| attr.ok())
317                .find(|attr| {
318                    String::from_utf8_lossy(attr.key.as_ref())
319                        .trim()
320                        .to_lowercase()
321                        == "href"
322                })
323            {
324                let href = href
325                    .decode_and_unescape_value(reader.decoder())
326                    .unwrap_or_default()
327                    .to_string();
328
329                if !href.is_empty() {
330                    dehtml.last_href = Some(href);
331                    *dehtml.get_buf() += "[";
332                }
333            }
334        }
335        "b" | "strong" => {
336            if dehtml.get_add_text() != AddText::No {
337                *dehtml.get_buf() += "*";
338            }
339        }
340        "i" | "em" => {
341            if dehtml.get_add_text() != AddText::No {
342                *dehtml.get_buf() += "_";
343            }
344        }
345        "blockquote" => dehtml.blockquotes_since_blockquote += 1,
346        _ => {}
347    }
348}
349
/// Registers a closing tag: lowers the nesting counter by one, never going below `0`.
///
/// To know when a specific tag is closed, opening and closing tags have to be counted.
/// The counters live in the `Dehtml` struct.
fn pop_tag(count: &mut u32) {
    *count = count.saturating_sub(1);
}
357
358/// In order to know when a specific tag is closed, we need to count the opening and closing tags.
359/// The `counts`s are stored in the `Dehtml` struct.
360fn maybe_push_tag(
361    event: &BytesStart,
362    reader: &Reader<impl BufRead>,
363    tag_name: &str,
364    count: &mut u32,
365) {
366    if *count > 0 || tag_contains_attr(event, reader, tag_name) {
367        *count += 1;
368    }
369}
370
371fn tag_contains_attr(event: &BytesStart, reader: &Reader<impl BufRead>, name: &str) -> bool {
372    event.attributes().any(|r| {
373        r.map(|a| {
374            a.decode_and_unescape_value(reader.decoder())
375                .map(|v| v == name)
376                .unwrap_or(false)
377        })
378        .unwrap_or(false)
379    })
380}
381
/// Crude fallback conversion: drops `<`, `>` and everything in between a `<` and the
/// next `>`, and keeps all other characters unchanged.
pub fn dehtml_manually(buf: &str) -> String {
    let mut inside_tag = false;
    buf.chars()
        .filter(|&c| match c {
            '<' => {
                inside_tag = true;
                false
            }
            '>' => {
                inside_tag = false;
                false
            }
            _ => !inside_tag,
        })
        .collect()
}
399
#[cfg(test)]
mod tests {
    use super::*;

    /// Converts `html` and returns the extracted text; panics if nothing was extracted.
    fn text_of(html: &str) -> String {
        dehtml(html).unwrap().text
    }

    /// Basic markup conversion plus inputs that must yield no text at all.
    #[test]
    fn test_dehtml() {
        let expectations = [
            (
                "<a href='https://example.com'> Foo </a>",
                "[ Foo ](https://example.com)",
            ),
            ("<b> bar </b>", "* bar *"),
            ("<i>foo</i>", "_foo_"),
            ("<b> bar <i> foo", "* bar _ foo"),
            ("&amp; bar", "& bar"),
            // Despite missing ', this should be shown:
            ("<a href='/foo.png>Hi</a> ", "Hi"),
            ("No link: <a href='https://get.delta.chat/'/>", "No link:"),
            (
                "No link: <a href='https://get.delta.chat/'></a>",
                "No link:",
            ),
            ("<!doctype html>\n<b>fat text</b>", "*fat text*"),
            // Invalid html (at least DC should show the text if the html is invalid):
            ("<!some invalid html code>\n<b>some text</b>", "some text"),
        ];
        for (html, expected) in expectations {
            assert_eq!(text_of(html), expected);
        }

        for html in ["<html> </html>", ""] {
            assert_eq!(dehtml(html), None);
        }
    }

    /// `<br>` becomes a single line break; surrounding whitespace is cleaned up.
    #[test]
    fn test_dehtml_parse_br() {
        let expectations = [
            ("line1<br>line2", "line1\nline2"),
            ("line1<br> line2", "line1\nline2"),
            ("line1  <br><br> line2", "line1\n\nline2"),
            (
                "\r\r\nline1<br>\r\n\r\n\r\rline2<br/>line3\n\r",
                "line1\nline2\nline3",
            ),
        ];
        for (html, expected) in expectations {
            assert_eq!(text_of(html), expected);
        }
    }

    /// Inline elements must neither swallow nor duplicate whitespace.
    #[test]
    fn test_dehtml_parse_span() {
        let expectations = [
            ("<span>Foo</span>bar", "Foobar"),
            ("<span>Foo</span> bar", "Foo bar"),
            ("<span>Foo </span>bar", "Foo bar"),
            ("<span>Foo</span>\nbar", "Foo bar"),
            ("\n<span>Foo</span> bar", "Foo bar"),
            ("<span>Foo</span>\n\nbar", "Foo bar"),
            ("Foo\n<span>bar</span>", "Foo bar"),
            ("Foo<span>\nbar</span>", "Foo bar"),
        ];
        for (html, expected) in expectations {
            assert_eq!(text_of(html), expected);
        }
    }

    /// Paragraphs are separated by an empty line, whether or not `<p>` is closed.
    #[test]
    fn test_dehtml_parse_p() {
        let expectations = [
            ("<p>Foo</p><p>Bar</p>", "Foo\n\nBar"),
            ("<p>Foo<p>Bar", "Foo\n\nBar"),
            ("<p>Foo</p><p>Bar<p>Baz", "Foo\n\nBar\n\nBaz"),
        ];
        for (html, expected) in expectations {
            assert_eq!(text_of(html), expected);
        }
    }

    /// An unquoted `href` value is accepted.
    #[test]
    fn test_dehtml_parse_href() {
        assert_eq!(text_of("<a href=url>text</a>"), "[text](url)");
    }

    /// Tag and attribute names are case-insensitive, the URL itself keeps its case.
    #[test]
    fn test_dehtml_case_sensitive_link() {
        assert_eq!(
            text_of("<html><A HrEf=\"https://foo.bar/Data\">case in URLs matter</A></html>"),
            "[case in URLs matter](https://foo.bar/Data)"
        );
    }

    /// Doctype, comments and processing instructions are dropped, CDATA is kept.
    #[test]
    fn test_dehtml_bold_text() {
        assert_eq!(
            text_of(
                "<!DOCTYPE name [<!DOCTYPE ...>]><!-- comment -->text <b><?php echo ... ?>bold</b><![CDATA[<>]]>"
            ),
            "text *bold*<>"
        );
    }

    /// Known entities are resolved, unknown ones (`&noent;`) are passed through.
    #[test]
    fn test_dehtml_html_encoded() {
        let encoded = "&lt;&gt;&quot;&apos;&amp; &auml;&Auml;&ouml;&Ouml;&uuml;&Uuml;&szlig; foo&AElig;&ccedil;&Ccedil; &diams;&lrm;&rlm;&zwnj;&noent;&zwj;";

        assert_eq!(
            text_of(encoded),
            "<>\"\'& äÄöÖüÜß fooÆçÇ \u{2666}\u{200e}\u{200f}\u{200c}&noent;\u{200d}"
        );
    }

    /// A never-closed `<meta>` must not hide the body text.
    #[test]
    fn test_unclosed_tags() {
        let document = r##"
        <!DOCTYPE HTML PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN'
        'http://www.w3.org/TR/html4/loose.dtd'>
        <html>
        <head>
        <title>Hi</title>
        <meta http-equiv='Content-Type' content='text/html; charset=iso-8859-1'>						
        </head>
        <body>
        lots of text
        </body>
        </html>
        "##;
        let simplified = dehtml(document).unwrap();
        assert_eq!(simplified.text.trim(), "lots of text");
    }

    /// Line ends inside `<pre>` are preserved.
    #[test]
    fn test_pre_tag() {
        let simplified = dehtml("<html><pre>\ntwo\nlines\n</pre></html>").unwrap();
        assert_eq!(simplified.text.trim(), "two\nlines");
    }

    /// GMX-style `<div name="quote">` is split off into `top_quote`.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn test_quote_div() {
        let raw = include_str!("../test-data/message/gmx-quote-body.eml");
        // Destructure exhaustively, so that a new field cannot go unchecked.
        let SimplifiedText {
            text,
            is_forwarded,
            is_cut,
            top_quote,
            footer,
        } = dehtml(raw).unwrap();
        assert_eq!(text, "Test");
        assert!(!is_forwarded);
        assert!(!is_cut);
        assert_eq!(top_quote.as_deref(), Some("test"));
        assert_eq!(footer, None);
    }

    /// Whitespace handling on a real-world, table-heavy HTML mail.
    #[test]
    fn test_spaces() {
        let simplified = dehtml(include_str!("../test-data/spaces.html")).unwrap();
        assert_eq!(
            simplified.text,
            "Welcome back to Strolling!\n\nHey there,\n\nWelcome back! Use this link to securely sign in to your Strolling account:\n\nSign in to Strolling\n\nFor your security, the link will expire in 24 hours time.\n\nSee you soon!\n\nYou can also copy & paste this URL into your browser:\n\nhttps://strolling.rosano.ca/members/?token=XXX&action=signin&r=https%3A%2F%2Fstrolling.rosano.ca%2F\n\nIf you did not make this request, you can safely ignore this email.\n\nThis message was sent from [strolling.rosano.ca](https://strolling.rosano.ca/) to [alice@example.org](mailto:alice@example.org)"
        );
    }
}