1use std::io::BufRead;
6use std::sync::LazyLock;
7
8use quick_xml::{
9 events::{BytesEnd, BytesStart, BytesText},
10 Reader,
11};
12
13use crate::simplify::{simplify_quote, SimplifiedText};
14
15struct Dehtml {
16 strbuilder: String,
17 quote: String,
18 add_text: AddText,
19 last_href: Option<String>,
20 divs_since_quote_div: u32,
24 divs_since_quoted_content_div: u32,
27 blockquotes_since_blockquote: u32,
30}
31
32impl Dehtml {
33 fn is_quote(&self) -> bool {
35 self.divs_since_quoted_content_div > 0 || self.blockquotes_since_blockquote > 0
36 }
37
38 fn get_buf(&mut self) -> &mut String {
42 if self.is_quote() {
43 &mut self.quote
44 } else {
45 &mut self.strbuilder
46 }
47 }
48
49 fn get_add_text(&self) -> AddText {
50 if self.divs_since_quote_div > 0 && self.divs_since_quoted_content_div == 0 {
51 AddText::No } else {
53 self.add_text
54 }
55 }
56}
57
/// How text nodes are handled at the current parser position.
#[derive(Clone, Copy, Debug, PartialEq)]
enum AddText {
    /// Text nodes are ignored.
    No,

    /// Text nodes are kept; every run of line breaks becomes a single space.
    YesRemoveLineEnds,

    /// Text nodes are kept; every run of line breaks becomes a single `\n`.
    YesPreserveLineEnds,
}
69
70pub(crate) fn dehtml(buf: &str) -> Option<SimplifiedText> {
71 let (s, quote) = dehtml_quick_xml(buf);
72 if !s.trim().is_empty() {
73 let text = dehtml_cleanup(s);
74 let top_quote = if !quote.trim().is_empty() {
75 Some(dehtml_cleanup(simplify_quote("e).0))
76 } else {
77 None
78 };
79 return Some(SimplifiedText {
80 text,
81 top_quote,
82 ..Default::default()
83 });
84 }
85 let s = dehtml_manually(buf);
86 if !s.trim().is_empty() {
87 let text = dehtml_cleanup(s);
88 return Some(SimplifiedText {
89 text,
90 ..Default::default()
91 });
92 }
93 None
94}
95
/// Normalizes whitespace in converted text.
///
/// * all `\r` characters are removed,
/// * leading and trailing whitespace of the whole text is trimmed,
/// * trailing whitespace of every line is trimmed,
/// * any run of blank (whitespace-only) lines collapses into one empty line.
fn dehtml_cleanup(mut text: String) -> String {
    text.retain(|c| c != '\r');

    let mut cleaned = String::with_capacity(text.len());
    // Set when at least one blank line was skipped since the last kept line.
    let mut blank_pending = false;
    for line in text.trim().split('\n') {
        if line.trim().is_empty() {
            blank_pending = true;
            continue;
        }
        if !cleaned.is_empty() {
            cleaned.push_str(if blank_pending { "\n\n" } else { "\n" });
        }
        cleaned.push_str(line.trim_end());
        blank_pending = false;
    }
    cleaned
}
117
118fn dehtml_quick_xml(buf: &str) -> (String, String) {
119 let buf = buf.trim().trim_start_matches("<!doctype html>");
120
121 let mut dehtml = Dehtml {
122 strbuilder: String::with_capacity(buf.len()),
123 quote: String::new(),
124 add_text: AddText::YesRemoveLineEnds,
125 last_href: None,
126 divs_since_quote_div: 0,
127 divs_since_quoted_content_div: 0,
128 blockquotes_since_blockquote: 0,
129 };
130
131 let mut reader = quick_xml::Reader::from_str(buf);
132 reader.config_mut().check_end_names = false;
133
134 let mut buf = Vec::new();
135
136 loop {
137 match reader.read_event_into(&mut buf) {
138 Ok(quick_xml::events::Event::Start(ref e)) => {
139 dehtml_starttag_cb(e, &mut dehtml, &reader)
140 }
141 Ok(quick_xml::events::Event::End(ref e)) => dehtml_endtag_cb(e, &mut dehtml),
142 Ok(quick_xml::events::Event::Text(ref e)) => dehtml_text_cb(e, &mut dehtml),
143 Ok(quick_xml::events::Event::CData(e)) => match e.escape() {
144 Ok(e) => dehtml_text_cb(&e, &mut dehtml),
145 Err(e) => {
146 eprintln!(
147 "CDATA escape error at position {}: {:?}",
148 reader.buffer_position(),
149 e,
150 );
151 }
152 },
153 Ok(quick_xml::events::Event::Empty(ref e)) => {
154 dehtml_starttag_cb(e, &mut dehtml, &reader);
157 dehtml_endtag_cb(
158 &BytesEnd::new(String::from_utf8_lossy(e.name().as_ref())),
159 &mut dehtml,
160 );
161 }
162 Err(e) => {
163 eprintln!(
164 "Parse html error: Error at position {}: {:?}",
165 reader.buffer_position(),
166 e
167 );
168 }
169 Ok(quick_xml::events::Event::Eof) => break,
170 _ => (),
171 }
172 buf.clear();
173 }
174
175 (dehtml.strbuilder, dehtml.quote)
176}
177
178fn dehtml_text_cb(event: &BytesText, dehtml: &mut Dehtml) {
179 static LINE_RE: LazyLock<regex::Regex> =
180 LazyLock::new(|| regex::Regex::new(r"(\r?\n)+").unwrap());
181
182 if dehtml.get_add_text() == AddText::YesPreserveLineEnds
183 || dehtml.get_add_text() == AddText::YesRemoveLineEnds
184 {
185 let event = event as &[_];
186 let event_str = std::str::from_utf8(event).unwrap_or_default();
187 let mut last_added = escaper::decode_html_buf_sloppy(event).unwrap_or_default();
188 if event_str.starts_with(&last_added) {
189 last_added = event_str.to_string();
190 }
191
192 if dehtml.get_add_text() == AddText::YesRemoveLineEnds {
193 let last_added = LINE_RE.replace_all(&last_added, " ");
196
197 let buf = dehtml.get_buf();
201 if !buf.ends_with(' ') && !buf.ends_with('\n') && last_added.starts_with(' ') {
202 *buf += " ";
203 }
204
205 *buf += last_added.trim_start();
206 } else {
207 *dehtml.get_buf() += LINE_RE.replace_all(&last_added, "\n").as_ref();
208 }
209 }
210}
211
212fn dehtml_endtag_cb(event: &BytesEnd, dehtml: &mut Dehtml) {
213 let tag = String::from_utf8_lossy(event.name().as_ref())
214 .trim()
215 .to_lowercase();
216
217 match tag.as_str() {
218 "style" | "script" | "title" | "pre" => {
219 *dehtml.get_buf() += "\n\n";
220 dehtml.add_text = AddText::YesRemoveLineEnds;
221 }
222 "div" => {
223 pop_tag(&mut dehtml.divs_since_quote_div);
224 pop_tag(&mut dehtml.divs_since_quoted_content_div);
225
226 *dehtml.get_buf() += "\n\n";
227 dehtml.add_text = AddText::YesRemoveLineEnds;
228 }
229 "a" => {
230 if let Some(ref last_href) = dehtml.last_href.take() {
231 let buf = dehtml.get_buf();
232 if buf.ends_with('[') {
233 buf.truncate(buf.len() - 1);
234 } else {
235 *buf += "](";
236 *buf += last_href;
237 *buf += ")";
238 }
239 }
240 }
241 "b" | "strong" => {
242 if dehtml.get_add_text() != AddText::No {
243 *dehtml.get_buf() += "*";
244 }
245 }
246 "i" | "em" => {
247 if dehtml.get_add_text() != AddText::No {
248 *dehtml.get_buf() += "_";
249 }
250 }
251 "blockquote" => pop_tag(&mut dehtml.blockquotes_since_blockquote),
252 _ => {}
253 }
254}
255
256fn dehtml_starttag_cb<B: std::io::BufRead>(
257 event: &BytesStart,
258 dehtml: &mut Dehtml,
259 reader: &quick_xml::Reader<B>,
260) {
261 let tag = String::from_utf8_lossy(event.name().as_ref())
262 .trim()
263 .to_lowercase();
264
265 match tag.as_str() {
266 "p" | "table" | "td" => {
267 if !dehtml.strbuilder.is_empty() {
268 *dehtml.get_buf() += "\n\n";
269 }
270 dehtml.add_text = AddText::YesRemoveLineEnds;
271 }
272 #[rustfmt::skip]
273 "div" => {
274 maybe_push_tag(event, reader, "quote", &mut dehtml.divs_since_quote_div);
275 maybe_push_tag(event, reader, "quoted-content", &mut dehtml.divs_since_quoted_content_div);
276
277 *dehtml.get_buf() += "\n\n";
278 dehtml.add_text = AddText::YesRemoveLineEnds;
279 }
280 "br" => {
281 *dehtml.get_buf() += "\n";
282 dehtml.add_text = AddText::YesRemoveLineEnds;
283 }
284 "style" | "script" | "title" => {
285 dehtml.add_text = AddText::No;
286 }
287 "pre" => {
288 *dehtml.get_buf() += "\n\n";
289 dehtml.add_text = AddText::YesPreserveLineEnds;
290 }
291 "a" => {
292 if let Some(href) = event
293 .html_attributes()
294 .filter_map(|attr| attr.ok())
295 .find(|attr| {
296 String::from_utf8_lossy(attr.key.as_ref())
297 .trim()
298 .to_lowercase()
299 == "href"
300 })
301 {
302 let href = href
303 .decode_and_unescape_value(reader.decoder())
304 .unwrap_or_default()
305 .to_string();
306
307 if !href.is_empty() {
308 dehtml.last_href = Some(href);
309 *dehtml.get_buf() += "[";
310 }
311 }
312 }
313 "b" | "strong" => {
314 if dehtml.get_add_text() != AddText::No {
315 *dehtml.get_buf() += "*";
316 }
317 }
318 "i" | "em" => {
319 if dehtml.get_add_text() != AddText::No {
320 *dehtml.get_buf() += "_";
321 }
322 }
323 "blockquote" => dehtml.blockquotes_since_blockquote += 1,
324 _ => {}
325 }
326}
327
/// Decrements a nesting counter by one, leaving it at zero if it already is
/// zero (e.g. on a stray closing tag).
fn pop_tag(count: &mut u32) {
    *count = count.saturating_sub(1);
}
335
336fn maybe_push_tag(
339 event: &BytesStart,
340 reader: &Reader<impl BufRead>,
341 tag_name: &str,
342 count: &mut u32,
343) {
344 if *count > 0 || tag_contains_attr(event, reader, tag_name) {
345 *count += 1;
346 }
347}
348
349fn tag_contains_attr(event: &BytesStart, reader: &Reader<impl BufRead>, name: &str) -> bool {
350 event.attributes().any(|r| {
351 r.map(|a| {
352 a.decode_and_unescape_value(reader.decoder())
353 .map(|v| v == name)
354 .unwrap_or(false)
355 })
356 .unwrap_or(false)
357 })
358}
359
/// Naive HTML-to-text fallback: drops everything from a `<` up to the next
/// `>` (both included) and keeps all other characters unchanged.
///
/// A `>` outside a tag is dropped as well; an unterminated `<` swallows the
/// rest of the input.
pub fn dehtml_manually(buf: &str) -> String {
    let mut inside_tag = false;
    buf.chars()
        .filter(|&c| match c {
            '<' => {
                inside_tag = true;
                false
            }
            '>' => {
                inside_tag = false;
                false
            }
            _ => !inside_tag,
        })
        .collect()
}
377
#[cfg(test)]
mod tests {
    use super::*;

    /// Basic conversions: links, emphasis, entities, and inputs for which no
    /// text can be extracted at all.
    #[test]
    fn test_dehtml() {
        let cases = vec![
            (
                "<a href='https://example.com'> Foo </a>",
                "[ Foo ](https://example.com)",
            ),
            ("<b> bar </b>", "* bar *"),
            ("<i>foo</i>", "_foo_"),
            ("<b> bar <i> foo", "* bar _ foo"),
            ("&amp; bar", "& bar"),
            // Despite the missing closing `'`, the link text must be shown.
            ("<a href='/foo.png>Hi</a> ", "Hi"),
            ("No link: <a href='https://get.delta.chat/'/>", "No link:"),
            (
                "No link: <a href='https://get.delta.chat/'></a>",
                "No link:",
            ),
            ("<!doctype html>\n<b>fat text</b>", "*fat text*"),
            // Invalid HTML: the text must still be shown.
            ("<!some invalid html code>\n<b>some text</b>", "some text"),
        ];
        for (input, output) in cases {
            assert_eq!(dehtml(input).unwrap().text, output);
        }
        let none_cases = vec!["<html> </html>", ""];
        for input in none_cases {
            assert_eq!(dehtml(input), None);
        }
    }

    /// `<br>` produces a single line break; surrounding whitespace and raw
    /// line ends are normalized away.
    #[test]
    fn test_dehtml_parse_br() {
        let html = "line1<br>line2";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "line1\nline2");

        let html = "line1<br> line2";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "line1\nline2");

        let html = "line1  <br><br> line2";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "line1\n\nline2");

        let html = "\r\r\nline1<br>\r\n\r\n\r\rline2<br/>line3\n\r";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "line1\nline2\nline3");
    }

    /// Inline elements do not introduce breaks; whitespace between them
    /// collapses to a single space.
    #[test]
    fn test_dehtml_parse_span() {
        assert_eq!(dehtml("<span>Foo</span>bar").unwrap().text, "Foobar");
        assert_eq!(dehtml("<span>Foo</span> bar").unwrap().text, "Foo bar");
        assert_eq!(dehtml("<span>Foo </span>bar").unwrap().text, "Foo bar");
        assert_eq!(dehtml("<span>Foo</span>\nbar").unwrap().text, "Foo bar");
        assert_eq!(dehtml("\n<span>Foo</span> bar").unwrap().text, "Foo bar");
        assert_eq!(dehtml("<span>Foo</span>\n\nbar").unwrap().text, "Foo bar");
        assert_eq!(dehtml("Foo\n<span>bar</span>").unwrap().text, "Foo bar");
        assert_eq!(dehtml("Foo<span>\nbar</span>").unwrap().text, "Foo bar");
    }

    /// Paragraphs are separated by one empty line, whether or not `<p>` is
    /// closed.
    #[test]
    fn test_dehtml_parse_p() {
        let html = "<p>Foo</p><p>Bar</p>";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "Foo\n\nBar");

        let html = "<p>Foo<p>Bar";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "Foo\n\nBar");

        let html = "<p>Foo</p><p>Bar<p>Baz";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "Foo\n\nBar\n\nBaz");
    }

    /// An unquoted `href` value is accepted.
    #[test]
    fn test_dehtml_parse_href() {
        let html = "<a href=url>text</a>";
        let plain = dehtml(html).unwrap().text;

        assert_eq!(plain, "[text](url)");
    }

    /// Tag and attribute names match case-insensitively, while the URL keeps
    /// its case.
    #[test]
    fn test_dehtml_case_sensitive_link() {
        let html = "<html><A HrEf=\"https://foo.bar/Data\">case in URLs matter</A></html>";
        let plain = dehtml(html).unwrap().text;
        assert_eq!(plain, "[case in URLs matter](https://foo.bar/Data)");
    }

    /// DOCTYPE, comments and processing instructions are skipped; CDATA is
    /// emitted as text.
    #[test]
    fn test_dehtml_bold_text() {
        let html = "<!DOCTYPE name [<!DOCTYPE ...>]><!-- comment -->text <b><?php echo ... ?>bold</b><![CDATA[<>]]>";
        let plain = dehtml(html).unwrap().text;

        assert_eq!(plain, "text *bold*<>");
    }

    /// Named entities are decoded; the unknown `&noent;` is passed through.
    #[test]
    fn test_dehtml_html_encoded() {
        let html =
            "&lt;&gt;&quot;&apos;&amp; &auml;&Auml;&ouml;&Ouml;&uuml;&Uuml;&szlig; foo&AElig;&ccedil;&Ccedil; &diams;&lrm;&rlm;&zwnj;&noent;&zwj;";

        let plain = dehtml(html).unwrap().text;

        assert_eq!(
            plain,
            "<>\"\'& äÄöÖüÜß fooÆçÇ \u{2666}\u{200e}\u{200f}\u{200c}&noent;\u{200d}"
        );
    }

    /// An unclosed `<meta>` tag must not break extraction of the body text.
    #[test]
    fn test_unclosed_tags() {
        let input = r##"
        <!DOCTYPE HTML PUBLIC '-//W3C//DTD HTML 4.01 Transitional//EN'
        'http://www.w3.org/TR/html4/loose.dtd'>
        <html>
        <head>
        <title>Hi</title>
        <meta http-equiv='Content-Type' content='text/html; charset=iso-8859-1'>
        </head>
        <body>
        lots of text
        </body>
        </html>
        "##;
        let txt = dehtml(input).unwrap();
        assert_eq!(txt.text.trim(), "lots of text");
    }

    /// Line breaks inside `<pre>` are preserved.
    #[test]
    fn test_pre_tag() {
        let input = "<html><pre>\ntwo\nlines\n</pre></html>";
        let txt = dehtml(input).unwrap();
        assert_eq!(txt.text.trim(), "two\nlines");
    }

    /// A `quote` / `quoted-content` div pair ends up in `top_quote`, not in
    /// the main text.
    #[tokio::test(flavor = "multi_thread", worker_threads = 2)]
    async fn test_quote_div() {
        let input = include_str!("../test-data/message/gmx-quote-body.eml");
        let dehtml = dehtml(input).unwrap();
        let SimplifiedText {
            text,
            is_forwarded,
            is_cut,
            top_quote,
            footer,
        } = dehtml;
        assert_eq!(text, "Test");
        assert!(!is_forwarded);
        assert!(!is_cut);
        assert_eq!(top_quote.as_deref(), Some("test"));
        assert_eq!(footer, None);
    }

    /// Whitespace handling on a real-world, table-based mail layout.
    #[test]
    fn test_spaces() {
        let input = include_str!("../test-data/spaces.html");
        let txt = dehtml(input).unwrap();
        assert_eq!(txt.text, "Welcome back to Strolling!\n\nHey there,\n\nWelcome back! Use this link to securely sign in to your Strolling account:\n\nSign in to Strolling\n\nFor your security, the link will expire in 24 hours time.\n\nSee you soon!\n\nYou can also copy & paste this URL into your browser:\n\nhttps://strolling.rosano.ca/members/?token=XXX&action=signin&r=https%3A%2F%2Fstrolling.rosano.ca%2F\n\nIf you did not make this request, you can safely ignore this email.\n\nThis message was sent from [strolling.rosano.ca](https://strolling.rosano.ca/) to [alice@example.org](mailto:alice@example.org)");
    }
}