term_transcript/test/parser/
text.rs

1//! Text parsing.
2
3use std::{borrow::Cow, fmt, io::Write, mem, ops, str};
4
5use quick_xml::{
6    escape::{resolve_xml_entity, EscapeError},
7    events::{attributes::Attributes, BytesStart, Event},
8};
9use termcolor::{Color, ColorSpec, WriteColor};
10
11use super::{extract_base_class, map_utf8_error, parse_classes, ParseError, Parsed};
12use crate::{
13    test::color_diff::ColorSpansWriter,
14    utils::{normalize_newlines, RgbColor},
15};
16
/// State of processing a hard line break element (an element with the `hard-br` base class).
#[derive(Debug)]
enum HardBreak {
    /// The hard break element has been opened and not yet closed; its contents are skipped.
    Active,
    /// The hard break element has just been closed. A leading `\n` of the immediately
    /// following text is dropped, since it is produced by the break itself.
    JustEnded,
}
22
/// Accumulator for text and its coloring that is fed XML events one at a time
/// via [`Self::process()`].
pub(super) struct TextReadingState {
    /// Plain text accumulated so far (without any styling).
    pub plaintext_buffer: String,
    /// Writer receiving the same text as `plaintext_buffer`, plus color changes.
    color_spans_writer: ColorSpansWriter,
    /// Number of currently open tags. Incremented on each start tag and decremented on each
    /// end tag; once it reaches zero, the accumulated text is returned from `process()`.
    open_tags: usize,
    /// Value of `open_tags` at which the enclosing `output-bg` `<text>` element is closed.
    /// While this is `Some(_)`, all events are skipped.
    bg_line_level: Option<usize>,
    /// Hard break processing state; `None` if no hard break is being processed.
    hard_br: Option<HardBreak>,
}
30
31impl fmt::Debug for TextReadingState {
32    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
33        formatter
34            .debug_struct("TextReadingState")
35            .field("plaintext_buffer", &self.plaintext_buffer)
36            .finish_non_exhaustive()
37    }
38}
39
40impl Default for TextReadingState {
41    fn default() -> Self {
42        Self {
43            color_spans_writer: ColorSpansWriter::default(),
44            plaintext_buffer: String::new(),
45            open_tags: 1,
46            bg_line_level: None,
47            hard_br: None,
48        }
49    }
50}
51
52impl TextReadingState {
    /// Checks whether no text has been accumulated in the plaintext buffer.
    pub(super) fn is_empty(&self) -> bool {
        self.plaintext_buffer.is_empty()
    }
56
    /// Returns the number of tags that are currently open.
    pub(super) fn open_tags(&self) -> usize {
        self.open_tags
    }
60
61    fn should_ignore_text(&self) -> bool {
62        self.bg_line_level.is_some() || self.hard_br.is_some()
63    }
64
65    // We only retain `<span>` tags in the HTML since they are the only ones containing color info.
66    #[allow(clippy::too_many_lines)]
67    pub(super) fn process(
68        &mut self,
69        event: Event<'_>,
70        position: ops::Range<usize>,
71    ) -> Result<Option<Parsed>, ParseError> {
72        let after_hard_break = matches!(self.hard_br, Some(HardBreak::JustEnded));
73        if after_hard_break
74            && matches!(
75                &event,
76                Event::Text(_) | Event::GeneralRef(_) | Event::Start(_)
77            )
78        {
79            self.hard_br = None;
80        }
81
82        match event {
83            Event::Text(text) => {
84                if self.should_ignore_text() {
85                    return Ok(None);
86                }
87
88                let unescaped_str = text.decode().map_err(quick_xml::Error::from)?;
89                let unescaped_str = normalize_newlines(&unescaped_str);
90                let unescaped_str = if after_hard_break && unescaped_str.starts_with('\n') {
91                    &unescaped_str[1..] // gobble the starting '\n' as produced by a hard break
92                } else {
93                    &unescaped_str
94                };
95                self.push_text(unescaped_str);
96            }
97            Event::GeneralRef(reference) => {
98                if self.should_ignore_text() {
99                    return Ok(None);
100                }
101
102                let maybe_char = reference.resolve_char_ref()?;
103                let mut char_buffer = [0_u8; 4];
104                let decoded = if let Some(c) = maybe_char {
105                    c.encode_utf8(&mut char_buffer)
106                } else {
107                    let decoded = reference.decode().map_err(quick_xml::Error::from)?;
108                    resolve_xml_entity(&decoded).ok_or_else(|| {
109                        let err = EscapeError::UnrecognizedEntity(position, decoded.into_owned());
110                        quick_xml::Error::from(err)
111                    })?
112                };
113                self.push_text(decoded);
114            }
115            Event::Start(tag) => {
116                self.open_tags += 1;
117                if self.bg_line_level.is_some() {
118                    return Ok(None);
119                } else if self.hard_br.is_some() {
120                    return Err(ParseError::InvalidHardBreak);
121                }
122
123                let tag_name = tag.name();
124                // FIXME: remove bg line logic (no longer necessary)
125                if tag_name.as_ref() == b"text" && Self::is_bg_line(tag.attributes())? {
126                    self.bg_line_level = Some(self.open_tags - 1);
127                    return Ok(None);
128                }
129                // Check for the hard line break <tspan> or <b>. We mustn't add its contents to the text,
130                // and instead gobble the following '\n'.
131                let classes = parse_classes(tag.attributes())?;
132                if extract_base_class(&classes) == b"hard-br" {
133                    self.hard_br = Some(HardBreak::Active);
134                    return Ok(None);
135                }
136
137                if Self::is_text_span(tag_name.as_ref()) {
138                    let color_spec = Self::parse_color_from_span(&tag)?;
139                    if !color_spec.is_none() {
140                        self.color_spans_writer
141                            .set_color(&color_spec)
142                            .expect("cannot set color for ANSI buffer");
143                    }
144                }
145            }
146            Event::End(tag) => {
147                self.open_tags -= 1;
148                if let Some(level) = self.bg_line_level {
149                    debug_assert!(level <= self.open_tags);
150                    if self.open_tags == level {
151                        self.bg_line_level = None;
152                    }
153                    return Ok(None);
154                } else if matches!(self.hard_br, Some(HardBreak::Active)) {
155                    self.hard_br = Some(HardBreak::JustEnded);
156                    return Ok(None);
157                }
158
159                if Self::is_text_span(tag.name().as_ref()) {
160                    // FIXME: check embedded color specs (should never be produced).
161                    self.color_spans_writer
162                        .reset()
163                        .expect("cannot reset color for ANSI buffer");
164                }
165
166                if self.open_tags == 0 {
167                    let plaintext = mem::take(&mut self.plaintext_buffer);
168                    let color_spans = mem::take(&mut self.color_spans_writer).into_inner();
169                    let mut parsed = Parsed {
170                        plaintext,
171                        color_spans,
172                    };
173                    parsed.trim_ending_newline();
174                    return Ok(Some(parsed));
175                }
176            }
177            _ => { /* Do nothing */ }
178        }
179        Ok(None)
180    }
181
182    fn is_text_span(tag: &[u8]) -> bool {
183        matches!(tag, b"span" | b"tspan" | b"text")
184    }
185
186    fn push_text(&mut self, text: &str) {
187        self.plaintext_buffer.push_str(text);
188        self.color_spans_writer
189            .write_all(text.as_bytes())
190            .expect("cannot write to ANSI buffer");
191    }
192
193    fn is_bg_line(attrs: Attributes<'_>) -> Result<bool, ParseError> {
194        let classes = parse_classes(attrs)?;
195        Ok(extract_base_class(&classes) == b"output-bg")
196    }
197
198    /// Parses color spec from a `span`.
199    ///
200    /// **NB.** Must correspond to the span creation logic in the `html` module.
201    fn parse_color_from_span(span_tag: &BytesStart) -> Result<ColorSpec, ParseError> {
202        let class_attr = parse_classes(span_tag.attributes())?;
203        let mut color_spec = ColorSpec::new();
204        Self::parse_color_from_classes(&mut color_spec, &class_attr);
205
206        let mut style = Cow::Borrowed(&[] as &[u8]);
207        for attr in span_tag.attributes() {
208            let attr = attr.map_err(quick_xml::Error::InvalidAttr)?;
209            if attr.key.as_ref() == b"style" {
210                style = attr.value;
211            }
212        }
213        Self::parse_color_from_style(&mut color_spec, &style)?;
214
215        Ok(color_spec)
216    }
217
218    fn parse_color_from_classes(color_spec: &mut ColorSpec, class_attr: &[u8]) {
219        let classes = class_attr.split(u8::is_ascii_whitespace);
220        for class in classes {
221            // Note that `class` may be empty because of multiple sequential whitespace chars.
222            // This is OK for us.
223            match class {
224                b"bold" => {
225                    color_spec.set_bold(true);
226                }
227                b"dimmed" => {
228                    color_spec.set_dimmed(true);
229                }
230                b"italic" => {
231                    color_spec.set_italic(true);
232                }
233                b"underline" => {
234                    color_spec.set_underline(true);
235                }
236
237                // Indexed foreground color candidate.
238                fg if fg.starts_with(b"fg") => {
239                    if let Some(color) = Self::parse_indexed_color(&fg[2..]) {
240                        color_spec.set_fg(Some(color));
241                    }
242                }
243                // Indexed background color candidate.
244                bg if bg.starts_with(b"bg") => {
245                    if let Some(color) = Self::parse_indexed_color(&bg[2..]) {
246                        color_spec.set_bg(Some(color));
247                    } else if let Ok(color_str) = str::from_utf8(&bg[2..]) {
248                        // Parse `bg#..` classes produced by the pure SVG template
249                        if let Ok(color) = color_str.parse::<RgbColor>() {
250                            color_spec.set_bg(Some(color.into_ansi_color()));
251                        }
252                    }
253                }
254
255                _ => { /* Ignore other classes. */ }
256            }
257        }
258    }
259
260    // **NB.** This parser is pretty rudimentary (e.g., does not understand comments).
261    fn parse_color_from_style(color_spec: &mut ColorSpec, style: &[u8]) -> Result<(), ParseError> {
262        for style_property in style.split(|&ch| ch == b';') {
263            let name_and_value: Vec<_> = style_property.splitn(2, |&ch| ch == b':').collect();
264            let [property_name, property_value] = name_and_value.as_slice() else {
265                continue;
266            };
267
268            let property_name = str::from_utf8(property_name)
269                .map_err(map_utf8_error)?
270                .trim();
271            let property_value = str::from_utf8(property_value)
272                .map_err(map_utf8_error)?
273                .trim();
274
275            match property_name {
276                "color" | "fill" => {
277                    if let Ok(color) = property_value.parse::<RgbColor>() {
278                        color_spec.set_fg(Some(color.into_ansi_color()));
279                    }
280                }
281                "background" | "background-color" => {
282                    if let Ok(color) = property_value.parse::<RgbColor>() {
283                        color_spec.set_bg(Some(color.into_ansi_color()));
284                    }
285                }
286                _ => { /* Ignore other properties. */ }
287            }
288        }
289        Ok(())
290    }
291
292    fn parse_indexed_color(class: &[u8]) -> Option<Color> {
293        Some(match class {
294            b"0" => Color::Black,
295            b"1" => Color::Red,
296            b"2" => Color::Green,
297            b"3" => Color::Yellow,
298            b"4" => Color::Blue,
299            b"5" => Color::Magenta,
300            b"6" => Color::Cyan,
301            b"7" => Color::White,
302            b"8" | b"9" => Color::Ansi256(class[0] - b'0'),
303            b"10" | b"11" | b"12" | b"13" | b"14" | b"15" => Color::Ansi256(10 + class[1] - b'0'),
304            _ => return None,
305        })
306    }
307}
308
309impl RgbColor {
310    fn into_ansi_color(self) -> Color {
311        Color::Rgb(self.0, self.1, self.2)
312    }
313}
314
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a color spec from the provided `class` attribute value.
    fn color_spec_from_classes(classes: &[u8]) -> ColorSpec {
        let mut color_spec = ColorSpec::new();
        TextReadingState::parse_color_from_classes(&mut color_spec, classes);
        color_spec
    }

    /// Builds a color spec from the provided `style` attribute value, panicking on errors.
    fn color_spec_from_style(style: &[u8]) -> ColorSpec {
        let mut color_spec = ColorSpec::new();
        TextReadingState::parse_color_from_style(&mut color_spec, style).unwrap();
        color_spec
    }

    #[test]
    fn parsing_color_index() {
        let valid_cases: [(&[u8], Color); 5] = [
            (b"0", Color::Black),
            (b"3", Color::Yellow),
            (b"9", Color::Ansi256(9)),
            (b"10", Color::Ansi256(10)),
            (b"15", Color::Ansi256(15)),
        ];
        for (class, expected) in valid_cases {
            assert_eq!(
                TextReadingState::parse_indexed_color(class),
                Some(expected)
            );
        }

        let invalid_cases: [&[u8]; 4] = [b"", b"17", b"01", b"333"];
        for class in invalid_cases {
            assert_eq!(TextReadingState::parse_indexed_color(class), None);
        }
    }

    #[test]
    fn parsing_color_from_classes() {
        let color_spec = color_spec_from_classes(b"bold fg3 underline bg11");

        assert!(color_spec.bold(), "{color_spec:?}");
        assert!(color_spec.underline(), "{color_spec:?}");
        assert_eq!(color_spec.fg(), Some(&Color::Yellow));
        assert_eq!(color_spec.bg(), Some(&Color::Ansi256(11)));
    }

    #[test]
    fn parsing_color_from_style() {
        let color_spec = color_spec_from_style(b"color: #fed; background: #c0ffee");

        assert_eq!(color_spec.fg(), Some(&Color::Rgb(0xff, 0xee, 0xdd)));
        assert_eq!(color_spec.bg(), Some(&Color::Rgb(0xc0, 0xff, 0xee)));
    }

    #[test]
    fn parsing_color_from_style_with_terminal_semicolon() {
        let color_spec = color_spec_from_style(b"color: #fed; background: #c0ffee;");

        assert_eq!(color_spec.fg(), Some(&Color::Rgb(0xff, 0xee, 0xdd)));
        assert_eq!(color_spec.bg(), Some(&Color::Rgb(0xc0, 0xff, 0xee)));
    }

    #[test]
    fn parsing_fg_color_from_svg_style() {
        let color_spec = color_spec_from_style(b"fill: #fed; stroke: #fed");

        assert_eq!(color_spec.fg(), Some(&Color::Rgb(0xff, 0xee, 0xdd)));
        assert_eq!(color_spec.bg(), None);
    }

    #[test]
    fn parsing_bg_color_from_svg_style() {
        let color_spec = color_spec_from_classes(b"bold fg3 bg#d7d75f");

        assert!(color_spec.bold());
        assert_eq!(color_spec.fg(), Some(&Color::Yellow));
        assert_eq!(color_spec.bg(), Some(&Color::Rgb(0xd7, 0xd7, 0x5f)));
    }
}