1use std::io::BufRead;
6use std::sync::LazyLock;
7
8use quick_xml::{
9 Reader,
10 errors::Error as QuickXmlError,
11 events::{BytesEnd, BytesStart, BytesText},
12};
13
14use crate::simplify::{SimplifiedText, simplify_quote};
15
16struct Dehtml {
17 strbuilder: String,
18 quote: String,
19 add_text: AddText,
20 last_href: Option<String>,
21 divs_since_quote_div: u32,
25 divs_since_quoted_content_div: u32,
28 blockquotes_since_blockquote: u32,
31}
32
33impl Dehtml {
34 fn is_quote(&self) -> bool {
36 self.divs_since_quoted_content_div > 0 || self.blockquotes_since_blockquote > 0
37 }
38
39 fn get_buf(&mut self) -> &mut String {
43 if self.is_quote() {
44 &mut self.quote
45 } else {
46 &mut self.strbuilder
47 }
48 }
49
50 fn get_add_text(&self) -> AddText {
51 if self.divs_since_quote_div > 0 && self.divs_since_quoted_content_div == 0 {
52 AddText::No } else {
54 self.add_text
55 }
56 }
57}
58
/// How text encountered by the parser is handled.
#[derive(Clone, Copy, Debug, PartialEq)]
enum AddText {
    /// Text is dropped, e.g. inside `<style>`, `<script>` and `<title>`.
    No,

    /// Text is kept; runs of line breaks are collapsed into a single space.
    YesRemoveLineEnds,

    /// Text is kept together with its line breaks; selected inside `<pre>`.
    YesPreserveLineEnds,
}
70
71pub(crate) fn dehtml(buf: &str) -> Option<SimplifiedText> {
72 let (s, quote) = dehtml_quick_xml(buf);
73 if !s.trim().is_empty() {
74 let text = dehtml_cleanup(s);
75 let top_quote = if !quote.trim().is_empty() {
76 Some(dehtml_cleanup(simplify_quote("e).0))
77 } else {
78 None
79 };
80 return Some(SimplifiedText {
81 text,
82 top_quote,
83 ..Default::default()
84 });
85 }
86 let s = dehtml_manually(buf);
87 if !s.trim().is_empty() {
88 let text = dehtml_cleanup(s);
89 return Some(SimplifiedText {
90 text,
91 ..Default::default()
92 });
93 }
94 None
95}
96
/// Normalizes whitespace of converted text.
///
/// Removes every `\r`, trims the text as a whole, strips trailing whitespace from each
/// line and collapses any run of blank lines into exactly one empty line.
fn dehtml_cleanup(mut text: String) -> String {
    text.retain(|c| c != '\r');

    let mut cleaned = String::new();
    // Set when at least one blank line was seen since the last line that was kept.
    let mut pending_blank = false;

    for raw_line in text.trim().split('\n') {
        let line = raw_line.trim_end();
        if line.is_empty() {
            pending_blank = true;
            continue;
        }
        if !cleaned.is_empty() {
            cleaned.push_str(if pending_blank { "\n\n" } else { "\n" });
        }
        cleaned.push_str(line);
        pending_blank = false;
    }

    cleaned
}
118
119fn dehtml_quick_xml(buf: &str) -> (String, String) {
120 let buf = buf.trim().trim_start_matches("<!doctype html>");
121
122 let mut dehtml = Dehtml {
123 strbuilder: String::with_capacity(buf.len()),
124 quote: String::new(),
125 add_text: AddText::YesRemoveLineEnds,
126 last_href: None,
127 divs_since_quote_div: 0,
128 divs_since_quoted_content_div: 0,
129 blockquotes_since_blockquote: 0,
130 };
131
132 let mut reader = quick_xml::Reader::from_str(buf);
133 reader.config_mut().check_end_names = false;
134
135 let mut buf = Vec::new();
136 let mut char_buf = String::with_capacity(4);
137
138 loop {
139 match reader.read_event_into(&mut buf) {
140 Ok(quick_xml::events::Event::Start(ref e)) => {
141 dehtml_starttag_cb(e, &mut dehtml, &reader)
142 }
143 Ok(quick_xml::events::Event::End(ref e)) => dehtml_endtag_cb(e, &mut dehtml),
144 Ok(quick_xml::events::Event::Text(ref e)) => dehtml_text_cb(e, &mut dehtml),
145 Ok(quick_xml::events::Event::CData(e)) => {
146 str_cb(&String::from_utf8_lossy(&e as &[_]), &mut dehtml)
147 }
148 Ok(quick_xml::events::Event::Empty(ref e)) => {
149 dehtml_starttag_cb(e, &mut dehtml, &reader);
152 dehtml_endtag_cb(
153 &BytesEnd::new(String::from_utf8_lossy(e.name().as_ref())),
154 &mut dehtml,
155 );
156 }
157 Ok(quick_xml::events::Event::GeneralRef(ref e)) => {
158 match e.resolve_char_ref() {
159 Err(err) => eprintln!(
160 "resolve_char_ref() error at position {}: {:?}",
161 reader.buffer_position(),
162 err,
163 ),
164 Ok(Some(ch)) => {
165 char_buf.clear();
166 char_buf.push(ch);
167 str_cb(&char_buf, &mut dehtml);
168 }
169 Ok(None) => {
170 let event_str = String::from_utf8_lossy(e);
171 if let Some(s) = quick_xml::escape::resolve_html5_entity(&event_str) {
172 str_cb(s, &mut dehtml);
173 } else {
174 str_cb(&format!("&{event_str};"), &mut dehtml);
176 }
177 }
178 }
179 }
180 Err(QuickXmlError::IllFormed(_)) => {
181 str_cb(&String::from_utf8_lossy(&buf), &mut dehtml);
183 }
184 Err(e) => {
185 eprintln!(
186 "Parse html error: Error at position {}: {:?}",
187 reader.buffer_position(),
188 e
189 );
190 }
191 Ok(quick_xml::events::Event::Eof) => break,
192 _ => (),
193 }
194 buf.clear();
195 }
196
197 (dehtml.strbuilder, dehtml.quote)
198}
199
200fn dehtml_text_cb(event: &BytesText, dehtml: &mut Dehtml) {
201 if dehtml.get_add_text() == AddText::YesPreserveLineEnds
202 || dehtml.get_add_text() == AddText::YesRemoveLineEnds
203 {
204 let event = event as &[_];
205 let event_str = std::str::from_utf8(event).unwrap_or_default();
206 str_cb(event_str, dehtml);
207 }
208}
209
210fn str_cb(event_str: &str, dehtml: &mut Dehtml) {
211 static LINE_RE: LazyLock<regex::Regex> =
212 LazyLock::new(|| regex::Regex::new(r"(\r?\n)+").unwrap());
213
214 let add_text = dehtml.get_add_text();
215 if add_text == AddText::YesRemoveLineEnds {
216 let event_str = LINE_RE.replace_all(event_str, " ");
219
220 let buf = dehtml.get_buf();
224 if !buf.ends_with(' ') && !buf.ends_with('\n') && event_str.starts_with(' ') {
225 *buf += " ";
226 }
227
228 *buf += event_str.trim_start();
229 } else if add_text == AddText::YesPreserveLineEnds {
230 *dehtml.get_buf() += LINE_RE.replace_all(event_str, "\n").as_ref();
231 }
232}
233
234fn dehtml_endtag_cb(event: &BytesEnd, dehtml: &mut Dehtml) {
235 let tag = String::from_utf8_lossy(event.name().as_ref())
236 .trim()
237 .to_lowercase();
238
239 match tag.as_str() {
240 "style" | "script" | "title" | "pre" => {
241 *dehtml.get_buf() += "\n\n";
242 dehtml.add_text = AddText::YesRemoveLineEnds;
243 }
244 "div" => {
245 pop_tag(&mut dehtml.divs_since_quote_div);
246 pop_tag(&mut dehtml.divs_since_quoted_content_div);
247
248 *dehtml.get_buf() += "\n\n";
249 dehtml.add_text = AddText::YesRemoveLineEnds;
250 }
251 "a" => {
252 if let Some(ref last_href) = dehtml.last_href.take() {
253 let buf = dehtml.get_buf();
254 if buf.ends_with('[') {
255 buf.truncate(buf.len() - 1);
256 } else {
257 *buf += "](";
258 *buf += last_href;
259 *buf += ")";
260 }
261 }
262 }
263 "b" | "strong" => {
264 if dehtml.get_add_text() != AddText::No {
265 *dehtml.get_buf() += "*";
266 }
267 }
268 "i" | "em" => {
269 if dehtml.get_add_text() != AddText::No {
270 *dehtml.get_buf() += "_";
271 }
272 }
273 "blockquote" => pop_tag(&mut dehtml.blockquotes_since_blockquote),
274 _ => {}
275 }
276}
277
278fn dehtml_starttag_cb<B: std::io::BufRead>(
279 event: &BytesStart,
280 dehtml: &mut Dehtml,
281 reader: &quick_xml::Reader<B>,
282) {
283 let tag = String::from_utf8_lossy(event.name().as_ref())
284 .trim()
285 .to_lowercase();
286
287 match tag.as_str() {
288 "p" | "table" | "td" => {
289 if !dehtml.strbuilder.is_empty() {
290 *dehtml.get_buf() += "\n\n";
291 }
292 dehtml.add_text = AddText::YesRemoveLineEnds;
293 }
294 #[rustfmt::skip]
295 "div" => {
296 maybe_push_tag(event, reader, "quote", &mut dehtml.divs_since_quote_div);
297 maybe_push_tag(event, reader, "quoted-content", &mut dehtml.divs_since_quoted_content_div);
298
299 *dehtml.get_buf() += "\n\n";
300 dehtml.add_text = AddText::YesRemoveLineEnds;
301 }
302 "br" => {
303 *dehtml.get_buf() += "\n";
304 dehtml.add_text = AddText::YesRemoveLineEnds;
305 }
306 "style" | "script" | "title" => {
307 dehtml.add_text = AddText::No;
308 }
309 "pre" => {
310 *dehtml.get_buf() += "\n\n";
311 dehtml.add_text = AddText::YesPreserveLineEnds;
312 }
313 "a" => {
314 if let Some(href) = event
315 .html_attributes()
316 .filter_map(|attr| attr.ok())
317 .find(|attr| {
318 String::from_utf8_lossy(attr.key.as_ref())
319 .trim()
320 .to_lowercase()
321 == "href"
322 })
323 {
324 let href = href
325 .decode_and_unescape_value(reader.decoder())
326 .unwrap_or_default()
327 .to_string();
328
329 if !href.is_empty() {
330 dehtml.last_href = Some(href);
331 *dehtml.get_buf() += "[";
332 }
333 }
334 }
335 "b" | "strong" => {
336 if dehtml.get_add_text() != AddText::No {
337 *dehtml.get_buf() += "*";
338 }
339 }
340 "i" | "em" => {
341 if dehtml.get_add_text() != AddText::No {
342 *dehtml.get_buf() += "_";
343 }
344 }
345 "blockquote" => dehtml.blockquotes_since_blockquote += 1,
346 _ => {}
347 }
348}
349
/// Decrements a tag depth counter, leaving it at `0` if it is already `0`.
///
/// Unbalanced closing tags therefore cannot make the counter underflow.
fn pop_tag(count: &mut u32) {
    *count = count.saturating_sub(1);
}
357
358fn maybe_push_tag(
361 event: &BytesStart,
362 reader: &Reader<impl BufRead>,
363 tag_name: &str,
364 count: &mut u32,
365) {
366 if *count > 0 || tag_contains_attr(event, reader, tag_name) {
367 *count += 1;
368 }
369}
370
371fn tag_contains_attr(event: &BytesStart, reader: &Reader<impl BufRead>, name: &str) -> bool {
372 event.attributes().any(|r| {
373 r.map(|a| {
374 a.decode_and_unescape_value(reader.decoder())
375 .map(|v| v == name)
376 .unwrap_or(false)
377 })
378 .unwrap_or(false)
379 })
380}
381
/// Strips tags without parsing: drops every `<` and `>` as well as everything between a
/// `<` and the next `>`, and keeps all other characters unchanged.
pub fn dehtml_manually(buf: &str) -> String {
    let mut inside_tag = false;
    buf.chars()
        .filter(|&ch| match ch {
            '<' => {
                inside_tag = true;
                false
            }
            '>' => {
                inside_tag = false;
                false
            }
            _ => !inside_tag,
        })
        .collect()
}
399
/// Tests for the HTML-to-text conversion.
#[cfg(test)]
mod tests {
    use super::*;

    /// Basic inline markup, links, entities and malformed input.
    #[test]
    fn test_dehtml() {
        let cases = vec![
            (
                "<a href='https://example.com'> Foo </a>",
                "[ Foo ](https://example.com)",
            ),
            ("<b> bar </b>", "* bar *"),
            ("<i>foo</i>", "_foo_"),
            ("<b> bar <i> foo", "* bar _ foo"),
            ("&amp; bar", "& bar"),
            // The closing quote of the attribute is missing; the text must still be shown.
            ("<a href='/foo.png>Hi</a> ", "Hi"),
            ("No link: <a href='https://get.delta.chat/'/>", "No link:"),
            (
                "No link: <a href='https://get.delta.chat/'></a>",
                "No link:",
            ),
            ("<!doctype html>\n<b>fat text</b>", "*fat text*"),
            // Invalid HTML: the text must still be shown.
            ("<!some invalid html code>\n<b>some text</b>", "some text"),
        ];
        for (input, output) in cases {
            assert_eq!(dehtml(input).unwrap().text, output);
        }
        let none_cases = vec!["<html> </html>", ""];
        for input in none_cases {
            assert_eq!(dehtml(input), None);
        }
    }

    #[test]
    fn test_dehtml_parse_br() {
        let html = "line1<br>line2";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "line1\nline2");

        let html = "line1<br> line2";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "line1\nline2");

        let html = "line1  <br><br>  line2";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "line1\n\nline2");

        let html = "\r\r\nline1<br>\r\n\r\n\r\rline2<br/>line3\n\r";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "line1\nline2\nline3");
    }

    #[test]
    fn test_dehtml_parse_span() {
        assert_eq!(dehtml("<span>Foo</span>bar").unwrap().text, "Foobar");
        assert_eq!(dehtml("<span>Foo</span> bar").unwrap().text, "Foo bar");
        assert_eq!(dehtml("<span>Foo </span>bar").unwrap().text, "Foo bar");
        assert_eq!(dehtml("<span>Foo</span>\nbar").unwrap().text, "Foo bar");
        assert_eq!(dehtml("\n<span>Foo</span> bar").unwrap().text, "Foo bar");
        assert_eq!(dehtml("<span>Foo</span>\n\nbar").unwrap().text, "Foo bar");
        assert_eq!(dehtml("Foo\n<span>bar</span>").unwrap().text, "Foo bar");
        assert_eq!(dehtml("Foo<span>\nbar</span>").unwrap().text, "Foo bar");
    }

    #[test]
    fn test_dehtml_parse_p() {
        let html = "<p>Foo</p><p>Bar</p>";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "Foo\n\nBar");

        let html = "<p>Foo<p>Bar";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "Foo\n\nBar");

        let html = "<p>Foo</p><p>Bar<p>Baz";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "Foo\n\nBar\n\nBaz");
    }

    #[test]
    fn test_dehtml_parse_href() {
        let html = "<a href=url>text</a>";
        let plain = dehtml(html).unwrap().text;

        assert_eq!(plain, "[text](url)");
    }

    /// Tag and attribute names match case-insensitively; the URL keeps its case.
    #[test]
    fn test_dehtml_case_sensitive_link() {
        let html = "<html><A HrEf=\"https://foo.bar/Data\">case in URLs matter</A></html>";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "[case in URLs matter](https://foo.bar/Data)");
    }

    #[test]
    fn test_dehtml_bold_text() {
        let html = "<!DOCTYPE name [<!DOCTYPE ...>]><!-- comment -->text <b><?php echo ... ?>bold</b><![CDATA[<>]]>";
        let plain = dehtml(html).unwrap().text;

        assert_eq!(plain, "text *bold*<>");
    }

    /// Named entities are resolved; unknown ones (`&noent;`) are passed through verbatim.
    #[test]
    fn test_dehtml_html_encoded() {
        let html = "&lt;&gt;&quot;&apos;&amp; &auml;&Auml;&ouml;&Ouml;&uuml;&Uuml;&szlig; foo&AElig;&ccedil;&Ccedil; &diams;&lrm;&rlm;&zwnj;&noent;&zwj;";

        let plain = dehtml(html).unwrap().text;

        assert_eq!(
            plain,
            "<>\"\'& äÄöÖüÜß fooÆçÇ \u{2666}\u{200e}\u{200f}\u{200c}&noent;\u{200d}"
        );
    }

    #[test]
    fn test_unclosed_tags() {
        let input = r##"
        <!DOCTYPE HTML PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN'
        'http://www.w3.org/TR/html4/loose.dtd'>
        <html>
        <head>
        <title>Hi</title>
        <meta http-equiv='Content-Type' content='text/html; charset=iso-8859-1'>
        </head>
        <body>
        lots of text
        </body>
        </html>
        "##;
        let txt = dehtml(input).unwrap();
        assert_eq!(txt.text.trim(), "lots of text");
    }

    #[test]
    fn test_pre_tag() {
        let input = "<html><pre>\ntwo\nlines\n</pre></html>";
        let txt = dehtml(input).unwrap();
        assert_eq!(txt.text.trim(), "two\nlines");
    }

    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn test_quote_div() {
        let input = include_str!("../test-data/message/gmx-quote-body.eml");
        let dehtml = dehtml(input).unwrap();
        let SimplifiedText {
            text,
            is_forwarded,
            is_cut,
            top_quote,
            footer,
        } = dehtml;
        assert_eq!(text, "Test");
        assert_eq!(is_forwarded, false);
        assert_eq!(is_cut, false);
        assert_eq!(top_quote.as_deref(), Some("test"));
        assert_eq!(footer, None);
    }

    #[test]
    fn test_spaces() {
        let input = include_str!("../test-data/spaces.html");
        let txt = dehtml(input).unwrap();
        assert_eq!(
            txt.text,
            "Welcome back to Strolling!\n\nHey there,\n\nWelcome back! Use this link to securely sign in to your Strolling account:\n\nSign in to Strolling\n\nFor your security, the link will expire in 24 hours time.\n\nSee you soon!\n\nYou can also copy & paste this URL into your browser:\n\nhttps://strolling.rosano.ca/members/?token=XXX&action=signin&r=https%3A%2F%2Fstrolling.rosano.ca%2F\n\nIf you did not make this request, you can safely ignore this email.\n\nThis message was sent from [strolling.rosano.ca](https://strolling.rosano.ca/) to [alice@example.org](mailto:alice@example.org)"
        );
    }
}