1use std::io::BufRead;
6use std::sync::LazyLock;
7
8use quick_xml::{
9 Reader,
10 errors::Error as QuickXmlError,
11 events::{BytesEnd, BytesStart, BytesText},
12};
13
14use crate::simplify::{SimplifiedText, simplify_quote};
15
/// Mutable state shared by the start-tag, end-tag and text callbacks
/// while an HTML document is converted to plain text.
#[derive(Default)]
struct Dehtml {
    /// Text collected while `is_quote()` is false; returned as the main text.
    strbuilder: String,
    /// Text collected while `is_quote()` is true; returned as the quote text.
    quote: String,
    /// Text mode last set by a tag callback. The effective mode is computed
    /// by `get_add_text()`, which may override it with `AddText::No`.
    add_text: AddText,
    /// Non-empty `href` value of the most recent `<a>` start tag.
    /// It is taken (reset to `None`) when the matching `</a>` is handled.
    last_href: Option<String>,
    /// `<div>` nesting counter that starts at a `<div>` carrying an attribute
    /// whose value is `quote`; 0 while no such `<div>` is being tracked.
    divs_since_quote_div: u32,
    /// `<div>` nesting counter that starts at a `<div>` carrying an attribute
    /// whose value is `quoted-content`; 0 while no such `<div>` is being tracked.
    divs_since_quoted_content_div: u32,
    /// `<div>` nesting counter that starts at a `<div>` carrying an attribute
    /// whose value is `header-protection-legacy-display`; 0 while no such
    /// `<div>` is being tracked.
    divs_since_hp_legacy_display: u32,
    /// Incremented on every `<blockquote>` start tag and decremented
    /// (never below 0) on every `</blockquote>`.
    blockquotes_since_blockquote: u32,
}
36
37impl Dehtml {
38 fn is_quote(&self) -> bool {
40 self.divs_since_quoted_content_div > 0 || self.blockquotes_since_blockquote > 0
41 }
42
43 fn get_buf(&mut self) -> &mut String {
47 if self.is_quote() {
48 &mut self.quote
49 } else {
50 &mut self.strbuilder
51 }
52 }
53
54 fn get_add_text(&self) -> AddText {
55 if self.divs_since_quote_div > 0 && self.divs_since_quoted_content_div == 0
58 || self.divs_since_hp_legacy_display > 0
59 {
60 AddText::No
61 } else {
62 self.add_text
63 }
64 }
65}
66
/// Controls whether and how text nodes are appended to the output buffer.
#[derive(Debug, Default, PartialEq, Clone, Copy)]
enum AddText {
    /// Text is dropped. Set on `<style>`, `<script>` and `<title>` start tags,
    /// and forced by `Dehtml::get_add_text()` inside certain divs.
    No,

    /// Text is appended with every run of line breaks replaced by a single
    /// space. This is the default mode.
    #[default]
    YesRemoveLineEnds,

    /// Text is appended with line breaks kept (each run collapsed to one
    /// `\n`). Set on a `<pre>` start tag.
    YesPreserveLineEnds,
}
79
80pub(crate) fn dehtml(buf: &str) -> Option<SimplifiedText> {
81 let (s, quote) = dehtml_quick_xml(buf);
82 if !s.trim().is_empty() {
83 let text = dehtml_cleanup(s);
84 let top_quote = if !quote.trim().is_empty() {
85 Some(dehtml_cleanup(simplify_quote("e).0))
86 } else {
87 None
88 };
89 return Some(SimplifiedText {
90 text,
91 top_quote,
92 ..Default::default()
93 });
94 }
95 let s = dehtml_manually(buf);
96 if !s.trim().is_empty() {
97 let text = dehtml_cleanup(s);
98 return Some(SimplifiedText {
99 text,
100 ..Default::default()
101 });
102 }
103 None
104}
105
/// Normalizes whitespace in converted text.
///
/// Carriage returns are removed, the whole text is trimmed, trailing
/// whitespace is stripped from every line, and any run of blank
/// (whitespace-only) lines between two non-blank lines is collapsed into a
/// single empty line.
fn dehtml_cleanup(mut text: String) -> String {
    text.retain(|c| c != '\r');

    let mut cleaned = String::new();
    let mut blank_pending = false;
    for line in text.trim().split('\n') {
        let content = line.trim_end();
        if content.is_empty() {
            // Remember the gap; it is emitted only if more text follows.
            blank_pending = true;
            continue;
        }
        if !cleaned.is_empty() {
            cleaned.push_str(if blank_pending { "\n\n" } else { "\n" });
        }
        cleaned.push_str(content);
        blank_pending = false;
    }
    cleaned
}
127
128fn dehtml_quick_xml(buf: &str) -> (String, String) {
129 let buf = buf.trim().trim_start_matches("<!doctype html>");
130
131 let mut dehtml = Dehtml {
132 strbuilder: String::with_capacity(buf.len()),
133 ..Default::default()
134 };
135
136 let mut reader = quick_xml::Reader::from_str(buf);
137 reader.config_mut().check_end_names = false;
138
139 let mut buf = Vec::new();
140 let mut char_buf = String::with_capacity(4);
141
142 loop {
143 match reader.read_event_into(&mut buf) {
144 Ok(quick_xml::events::Event::Start(ref e)) => {
145 dehtml_starttag_cb(e, &mut dehtml, &reader)
146 }
147 Ok(quick_xml::events::Event::End(ref e)) => dehtml_endtag_cb(e, &mut dehtml),
148 Ok(quick_xml::events::Event::Text(ref e)) => dehtml_text_cb(e, &mut dehtml),
149 Ok(quick_xml::events::Event::CData(e)) => {
150 str_cb(&String::from_utf8_lossy(&e as &[_]), &mut dehtml)
151 }
152 Ok(quick_xml::events::Event::Empty(ref e)) => {
153 dehtml_starttag_cb(e, &mut dehtml, &reader);
156 dehtml_endtag_cb(
157 &BytesEnd::new(String::from_utf8_lossy(e.name().as_ref())),
158 &mut dehtml,
159 );
160 }
161 Ok(quick_xml::events::Event::GeneralRef(ref e)) => {
162 match e.resolve_char_ref() {
163 Err(err) => eprintln!(
164 "resolve_char_ref() error at position {}: {:?}",
165 reader.buffer_position(),
166 err,
167 ),
168 Ok(Some(ch)) => {
169 char_buf.clear();
170 char_buf.push(ch);
171 str_cb(&char_buf, &mut dehtml);
172 }
173 Ok(None) => {
174 let event_str = String::from_utf8_lossy(e);
175 if let Some(s) = quick_xml::escape::resolve_html5_entity(&event_str) {
176 str_cb(s, &mut dehtml);
177 } else {
178 str_cb(&format!("&{event_str};"), &mut dehtml);
180 }
181 }
182 }
183 }
184 Err(QuickXmlError::IllFormed(_)) => {
185 str_cb(&String::from_utf8_lossy(&buf), &mut dehtml);
187 }
188 Err(e) => {
189 eprintln!(
190 "Parse html error: Error at position {}: {:?}",
191 reader.buffer_position(),
192 e
193 );
194 }
195 Ok(quick_xml::events::Event::Eof) => break,
196 _ => (),
197 }
198 buf.clear();
199 }
200
201 (dehtml.strbuilder, dehtml.quote)
202}
203
204fn dehtml_text_cb(event: &BytesText, dehtml: &mut Dehtml) {
205 if dehtml.get_add_text() == AddText::YesPreserveLineEnds
206 || dehtml.get_add_text() == AddText::YesRemoveLineEnds
207 {
208 let event = event as &[_];
209 let event_str = std::str::from_utf8(event).unwrap_or_default();
210 str_cb(event_str, dehtml);
211 }
212}
213
214fn str_cb(event_str: &str, dehtml: &mut Dehtml) {
215 static LINE_RE: LazyLock<regex::Regex> =
216 LazyLock::new(|| regex::Regex::new(r"(\r?\n)+").unwrap());
217
218 let add_text = dehtml.get_add_text();
219 if add_text == AddText::YesRemoveLineEnds {
220 let event_str = LINE_RE.replace_all(event_str, " ");
223
224 let buf = dehtml.get_buf();
228 if !buf.ends_with(' ') && !buf.ends_with('\n') && event_str.starts_with(' ') {
229 *buf += " ";
230 }
231
232 *buf += event_str.trim_start();
233 } else if add_text == AddText::YesPreserveLineEnds {
234 *dehtml.get_buf() += LINE_RE.replace_all(event_str, "\n").as_ref();
235 }
236}
237
238#[expect(clippy::arithmetic_side_effects)]
239fn dehtml_endtag_cb(event: &BytesEnd, dehtml: &mut Dehtml) {
240 let tag = String::from_utf8_lossy(event.name().as_ref())
241 .trim()
242 .to_lowercase();
243
244 match tag.as_str() {
245 "style" | "script" | "title" | "pre" => {
246 *dehtml.get_buf() += "\n\n";
247 dehtml.add_text = AddText::YesRemoveLineEnds;
248 }
249 "div" => {
250 pop_tag(&mut dehtml.divs_since_quote_div);
251 pop_tag(&mut dehtml.divs_since_quoted_content_div);
252 pop_tag(&mut dehtml.divs_since_hp_legacy_display);
253
254 *dehtml.get_buf() += "\n\n";
255 dehtml.add_text = AddText::YesRemoveLineEnds;
256 }
257 "a" => {
258 if let Some(ref last_href) = dehtml.last_href.take() {
259 let buf = dehtml.get_buf();
260 if buf.ends_with('[') {
261 buf.truncate(buf.len() - 1);
262 } else {
263 *buf += "](";
264 *buf += last_href;
265 *buf += ")";
266 }
267 }
268 }
269 "b" | "strong" if dehtml.get_add_text() != AddText::No => {
270 *dehtml.get_buf() += "*";
271 }
272 "i" | "em" if dehtml.get_add_text() != AddText::No => {
273 *dehtml.get_buf() += "_";
274 }
275 "blockquote" => pop_tag(&mut dehtml.blockquotes_since_blockquote),
276 _ => {}
277 }
278}
279
280#[expect(clippy::arithmetic_side_effects)]
281fn dehtml_starttag_cb<B: std::io::BufRead>(
282 event: &BytesStart,
283 dehtml: &mut Dehtml,
284 reader: &quick_xml::Reader<B>,
285) {
286 let tag = String::from_utf8_lossy(event.name().as_ref())
287 .trim()
288 .to_lowercase();
289
290 match tag.as_str() {
291 "p" | "table" | "td" => {
292 if !dehtml.strbuilder.is_empty() {
293 *dehtml.get_buf() += "\n\n";
294 }
295 dehtml.add_text = AddText::YesRemoveLineEnds;
296 }
297 #[rustfmt::skip]
298 "div" => {
299 maybe_push_tag(event, reader, "quote", &mut dehtml.divs_since_quote_div);
300 maybe_push_tag(event, reader, "quoted-content", &mut dehtml.divs_since_quoted_content_div);
301 maybe_push_tag(event, reader, "header-protection-legacy-display",
302 &mut dehtml.divs_since_hp_legacy_display);
303
304 *dehtml.get_buf() += "\n\n";
305 dehtml.add_text = AddText::YesRemoveLineEnds;
306 }
307 "br" => {
308 *dehtml.get_buf() += "\n";
309 dehtml.add_text = AddText::YesRemoveLineEnds;
310 }
311 "style" | "script" | "title" => {
312 dehtml.add_text = AddText::No;
313 }
314 "pre" => {
315 *dehtml.get_buf() += "\n\n";
316 dehtml.add_text = AddText::YesPreserveLineEnds;
317 }
318 "a" => {
319 if let Some(href) = event
320 .html_attributes()
321 .filter_map(|attr| attr.ok())
322 .find(|attr| {
323 String::from_utf8_lossy(attr.key.as_ref())
324 .trim()
325 .to_lowercase()
326 == "href"
327 })
328 {
329 let href = href
330 .decode_and_unescape_value(reader.decoder())
331 .unwrap_or_default()
332 .to_string();
333
334 if !href.is_empty() {
335 dehtml.last_href = Some(href);
336 *dehtml.get_buf() += "[";
337 }
338 }
339 }
340 "b" | "strong" if dehtml.get_add_text() != AddText::No => {
341 *dehtml.get_buf() += "*";
342 }
343 "i" | "em" if dehtml.get_add_text() != AddText::No => {
344 *dehtml.get_buf() += "_";
345 }
346 "blockquote" => dehtml.blockquotes_since_blockquote += 1,
347 _ => {}
348 }
349}
350
/// Decrements a tag nesting counter when a closing tag is seen.
///
/// The counter never goes below zero, so a stray closing tag without a
/// tracked opening tag leaves it at 0.
fn pop_tag(count: &mut u32) {
    // `saturating_sub` clamps at 0, so no manual guard or lint suppression
    // for the subtraction is needed.
    *count = count.saturating_sub(1);
}
359
360#[expect(clippy::arithmetic_side_effects)]
363fn maybe_push_tag(
364 event: &BytesStart,
365 reader: &Reader<impl BufRead>,
366 tag_name: &str,
367 count: &mut u32,
368) {
369 if *count > 0 || tag_contains_attr(event, reader, tag_name) {
370 *count += 1;
371 }
372}
373
374fn tag_contains_attr(event: &BytesStart, reader: &Reader<impl BufRead>, name: &str) -> bool {
375 event.attributes().any(|r| {
376 r.map(|a| {
377 a.decode_and_unescape_value(reader.decoder())
378 .map(|v| v == name)
379 .unwrap_or(false)
380 })
381 .unwrap_or(false)
382 })
383}
384
/// Strips markup without parsing it: everything from a `<` up to and
/// including the next `>` is dropped, all other characters are kept as is.
///
/// Neither `<` nor `>` ever appears in the result, and text after an
/// unclosed `<` is dropped entirely.
pub fn dehtml_manually(buf: &str) -> String {
    let mut inside_tag = false;
    buf.chars()
        .filter(|&c| match c {
            '<' => {
                inside_tag = true;
                false
            }
            '>' => {
                inside_tag = false;
                false
            }
            _ => !inside_tag,
        })
        .collect()
}
402
#[cfg(test)]
mod tests {
    use super::*;

    /// Table-driven checks of `dehtml()` for links, emphasis, entities,
    /// doctype handling and malformed markup, plus inputs that yield `None`.
    #[test]
    fn test_dehtml() {
        let cases = vec![
            (
                "<a href='https://example.com'> Foo </a>",
                "[ Foo ](https://example.com)",
            ),
            ("<b> bar </b>", "* bar *"),
            ("<i>foo</i>", "_foo_"),
            ("<b> bar <i> foo", "* bar _ foo"),
            ("&amp; bar", "& bar"),
            // The closing quote of the href value is missing;
            // the link text is still expected in the output.
            ("<a href='/foo.png>Hi</a> ", "Hi"),
            ("No link: <a href='https://get.delta.chat/'/>", "No link:"),
            (
                "No link: <a href='https://get.delta.chat/'></a>",
                "No link:",
            ),
            ("<!doctype html>\n<b>fat text</b>", "*fat text*"),
            // Invalid markup: the text is expected without bold markers.
            ("<!some invalid html code>\n<b>some text</b>", "some text"),
        ];
        for (input, output) in cases {
            assert_eq!(dehtml(input).unwrap().text, output);
        }
        let none_cases = vec!["<html> </html>", ""];
        for input in none_cases {
            assert_eq!(dehtml(input), None);
        }
    }

    /// `<br>` produces line breaks; surrounding whitespace and carriage
    /// returns do not leak into the result.
    #[test]
    fn test_dehtml_parse_br() {
        let html = "line1<br>line2";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "line1\nline2");

        let html = "line1<br> line2";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "line1\nline2");

        let html = "line1  <br><br>  line2";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "line1\n\nline2");

        let html = "\r\r\nline1<br>\r\n\r\n\r\rline2<br/>line3\n\r";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "line1\nline2\nline3");
    }

    /// Inline elements do not introduce breaks; whitespace and newlines
    /// between text fragments collapse to a single space.
    #[test]
    fn test_dehtml_parse_span() {
        assert_eq!(dehtml("<span>Foo</span>bar").unwrap().text, "Foobar");
        assert_eq!(dehtml("<span>Foo</span> bar").unwrap().text, "Foo bar");
        assert_eq!(dehtml("<span>Foo </span>bar").unwrap().text, "Foo bar");
        assert_eq!(dehtml("<span>Foo</span>\nbar").unwrap().text, "Foo bar");
        assert_eq!(dehtml("\n<span>Foo</span> bar").unwrap().text, "Foo bar");
        assert_eq!(dehtml("<span>Foo</span>\n\nbar").unwrap().text, "Foo bar");
        assert_eq!(dehtml("Foo\n<span>bar</span>").unwrap().text, "Foo bar");
        assert_eq!(dehtml("Foo<span>\nbar</span>").unwrap().text, "Foo bar");
    }

    /// Paragraphs are separated by one empty line, whether or not `<p>` is
    /// closed.
    #[test]
    fn test_dehtml_parse_p() {
        let html = "<p>Foo</p><p>Bar</p>";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "Foo\n\nBar");

        let html = "<p>Foo<p>Bar";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "Foo\n\nBar");

        let html = "<p>Foo</p><p>Bar<p>Baz";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "Foo\n\nBar\n\nBaz");
    }

    /// An unquoted `href` value is rendered as a `[text](url)` link.
    #[test]
    fn test_dehtml_parse_href() {
        let html = "<a href=url>text</a>";
        let plain = dehtml(html).unwrap().text;

        assert_eq!(plain, "[text](url)");
    }

    /// Tag and attribute names match case-insensitively, while the case of
    /// the URL itself is kept.
    #[test]
    fn test_dehtml_case_sensitive_link() {
        let html = "<html><A HrEf=\"https://foo.bar/Data\">case in URLs matter</A></html>";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "[case in URLs matter](https://foo.bar/Data)");
    }

    /// Doctype, comment and processing instruction are dropped; CDATA
    /// content is emitted as text.
    #[test]
    fn test_dehtml_bold_text() {
        let html = "<!DOCTYPE name [<!DOCTYPE ...>]><!-- comment -->text <b><?php echo ... ?>bold</b><![CDATA[<>]]>";
        let plain = dehtml(html).unwrap().text;

        assert_eq!(plain, "text *bold*<>");
    }

    /// Named entities are resolved; the unknown `&noent;` is kept verbatim.
    #[test]
    fn test_dehtml_html_encoded() {
        let html = "&lt;&gt;&quot;&apos;&amp; &auml;&Auml;&ouml;&Ouml;&uuml;&Uuml;&szlig; foo&AElig;&ccedil;&Ccedil; &diams;&lrm;&rlm;&zwnj;&noent;&zwj;";

        let plain = dehtml(html).unwrap().text;

        assert_eq!(
            plain,
            "<>\"\'& äÄöÖüÜß fooÆçÇ \u{2666}\u{200e}\u{200f}\u{200c}&noent;\u{200d}"
        );
    }

    /// A document with an unclosed `<meta>` tag still yields its body text.
    #[test]
    fn test_unclosed_tags() {
        let input = r##"
        <!DOCTYPE HTML PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN'
        'http://www.w3.org/TR/html4/loose.dtd'>
        <html>
        <head>
        <title>Hi</title>
        <meta http-equiv='Content-Type' content='text/html; charset=iso-8859-1'>
        </head>
        <body>
        lots of text
        </body>
        </html>
        "##;
        let txt = dehtml(input).unwrap();
        assert_eq!(txt.text.trim(), "lots of text");
    }

    /// Line breaks inside `<pre>` are preserved.
    #[test]
    fn test_pre_tag() {
        let input = "<html><pre>\ntwo\nlines\n</pre></html>";
        let txt = dehtml(input).unwrap();
        assert_eq!(txt.text.trim(), "two\nlines");
    }

    /// Content of a `header-protection-legacy-display` div is not part of
    /// the resulting text.
    #[test]
    fn test_hp_legacy_display() {
        let input = r#"
<html><head><title></title></head><body>
<div class="header-protection-legacy-display">
<pre>Subject: Dinner plans</pre>
</div>
<p>
Let's meet at Rama's Roti Shop at 8pm and go to the park
from there.
</p>
</body>
</html>
        "#;
        let txt = dehtml(input).unwrap();
        assert_eq!(
            txt.text.trim(),
            "Let's meet at Rama's Roti Shop at 8pm and go to the park from there."
        );
    }

    /// A quoted message body is split into the main text and `top_quote`.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn test_quote_div() {
        let input = include_str!("../test-data/message/gmx-quote-body.eml");
        let dehtml = dehtml(input).unwrap();
        let SimplifiedText {
            text,
            is_forwarded,
            is_cut,
            top_quote,
            footer,
        } = dehtml;
        assert_eq!(text, "Test");
        assert_eq!(is_forwarded, false);
        assert_eq!(is_cut, false);
        assert_eq!(top_quote.as_deref(), Some("test"));
        assert_eq!(footer, None);
    }

    /// Whitespace handling on a complete fixture document.
    #[test]
    fn test_spaces() {
        let input = include_str!("../test-data/spaces.html");
        let txt = dehtml(input).unwrap();
        assert_eq!(
            txt.text,
            "Welcome back to Strolling!\n\nHey there,\n\nWelcome back! Use this link to securely sign in to your Strolling account:\n\nSign in to Strolling\n\nFor your security, the link will expire in 24 hours time.\n\nSee you soon!\n\nYou can also copy & paste this URL into your browser:\n\nhttps://strolling.rosano.ca/members/?token=XXX&action=signin&r=https%3A%2F%2Fstrolling.rosano.ca%2F\n\nIf you did not make this request, you can safely ignore this email.\n\nThis message was sent from [strolling.rosano.ca](https://strolling.rosano.ca/) to [alice@example.org](mailto:alice@example.org)"
        );
    }
}