Skip to main content

afm_markdown/
source_line_anchors.rs

1//! Source-line anchor injection for the HTML renderer.
2//!
3//! When `Options::source_line_anchors` is `true`, top-level block
4//! elements in the rendered HTML get a `data-afm-source-line="N"`
5//! attribute (1-based) pointing back at the source line where the
6//! block began. The afm-obsidian document-mode adapter (Pillar 6)
7//! relies on this to map Obsidian's per-block post-processor calls
8//! back to slices of the full rendered fragment.
9//!
10//! Algorithm:
11//!
12//! 1. Walk comrak's top-level AST children (`root.children()`) and
13//!    collect each child's source position from
14//!    `node.data.borrow().sourcepos.start.line`.
15//! 2. After comrak emits the HTML string, scan it once and inject
16//!    `data-afm-source-line="N"` into the *first* opening tag at
17//!    each top-level block boundary. Top-level blocks are
18//!    identified positionally — the Nth top-level element in the
19//!    HTML corresponds to the Nth child of the comrak root.
20//!
21//! Why string-level injection rather than custom rendering: comrak's
22//! `format_html` doesn't expose a per-node attribute hook on every
23//! block kind we care about. A scan-and-inject pass is O(html) and
24//! deterministic; the per-element overhead is dwarfed by the comrak
25//! formatter's own cost.
26
27use comrak::nodes::AstNode;
28
29/// 1-based source line for each top-level child, in document order.
30pub(crate) fn collect_top_level_lines<'a>(root: &'a AstNode<'a>) -> Vec<usize> {
31    let mut out = Vec::new();
32    for child in root.children() {
33        let line = child.data.borrow().sourcepos.start.line;
34        // sourcepos is 1-based but defaults to 0 for synthetic nodes;
35        // clamp to >=1 so the attribute value is always meaningful.
36        out.push(line.max(1));
37    }
38    out
39}
40
41/// Insert `data-afm-source-line="N"` into the first opening tag at
42/// each top-level block boundary. Tags considered top-level are
43/// `<p>`, `<h1..h6>`, `<ul>`, `<ol>`, `<blockquote>`, `<pre>`,
44/// `<table>`, `<hr>`, `<div>` (containers).
45pub(crate) fn inject_anchors(html: &str, lines: &[usize]) -> String {
46    if lines.is_empty() {
47        return html.to_owned();
48    }
49    let mut out = String::with_capacity(html.len() + lines.len() * 24);
50    let mut idx = 0_usize;
51    let bytes = html.as_bytes();
52    let mut next_line = 0_usize;
53    let mut depth: i32 = 0;
54    while idx < bytes.len() {
55        let b = bytes[idx];
56        if b == b'<' && idx + 1 < bytes.len() && bytes[idx + 1] != b'/' {
57            // Possible opening tag (we ignore comments / declarations
58            // here — comrak doesn't emit them at the top level).
59            if let Some(tag_end) = find_tag_end(bytes, idx) {
60                let tag_slice = &html[idx..tag_end];
61                if depth == 0 && next_line < lines.len() && is_top_level_tag(tag_slice) {
62                    out.push_str(&inject_attribute(tag_slice, lines[next_line]));
63                    next_line += 1;
64                } else {
65                    out.push_str(tag_slice);
66                }
67                if !tag_slice.ends_with("/>") && !is_void_tag(tag_slice) {
68                    depth += 1;
69                }
70                idx = tag_end;
71                continue;
72            }
73        }
74        if b == b'<' && idx + 1 < bytes.len() && bytes[idx + 1] == b'/' {
75            // Closing tag.
76            if let Some(tag_end) = find_tag_end(bytes, idx) {
77                out.push_str(&html[idx..tag_end]);
78                depth = (depth - 1).max(0);
79                idx = tag_end;
80                continue;
81            }
82        }
83        out.push(b as char);
84        idx += 1;
85    }
86    out
87}
88
/// Returns the index one past the first `>` at or after `start` that
/// is not inside a `"`- or `'`-quoted run, or `None` when no such
/// byte exists.
///
/// Quote tracking begins at `start`, so callers are expected to pass
/// the position of the `<` that opens the tag.
fn find_tag_end(bytes: &[u8], start: usize) -> Option<usize> {
    let mut open_quote: Option<u8> = None;
    for (pos, &byte) in bytes.iter().enumerate().skip(start) {
        if let Some(delim) = open_quote {
            // Inside an attribute value: only the matching quote
            // character ends it; everything else is skipped.
            if byte == delim {
                open_quote = None;
            }
            continue;
        }
        if byte == b'>' {
            return Some(pos + 1);
        }
        if byte == b'"' || byte == b'\'' {
            open_quote = Some(byte);
        }
    }
    None
}
110
111fn is_top_level_tag(tag: &str) -> bool {
112    let name = tag_name(tag);
113    matches!(
114        name,
115        "p" | "h1"
116            | "h2"
117            | "h3"
118            | "h4"
119            | "h5"
120            | "h6"
121            | "ul"
122            | "ol"
123            | "blockquote"
124            | "pre"
125            | "table"
126            | "hr"
127            | "div"
128            | "section"
129            | "details"
130    )
131}
132
133fn is_void_tag(tag: &str) -> bool {
134    // Only relevant for the depth tracker; comrak emits `<hr>` and
135    // `<br>` at top level. Comrak does not currently use the XHTML
136    // self-closing form (`<br />`) by default.
137    let name = tag_name(tag);
138    matches!(name, "hr" | "br" | "img" | "input")
139}
140
/// Extracts the element name from a tag such as `<p>`, `</p>`,
/// `<p class="x">` or `<hr/>`.
///
/// Leading `<` and `/` characters and trailing `>` characters are
/// stripped; the name is whatever precedes the first whitespace, `/`
/// or `>` in the remainder. The name is returned as written — no case
/// folding is applied.
fn tag_name(tag: &str) -> &str {
    let inner = tag
        .trim_start_matches('<')
        .trim_end_matches('>')
        .trim_start_matches('/');
    match inner.find(|c: char| c.is_whitespace() || c == '>' || c == '/') {
        Some(name_end) => &inner[..name_end],
        // No delimiter at all: the whole remainder is the name.
        None => inner,
    }
}
148
/// Returns `tag` with ` data-afm-source-line="<line>"` inserted
/// immediately after the element name.
///
/// Input that does not start with `<` is returned unchanged.
///
/// The element name ends at the first ASCII whitespace byte, `/` or
/// `>`. Newlines, carriage returns and form feeds count as
/// whitespace, in line with `tag_name`; stopping only on space/tab
/// would let a tag like `<div\nclass="a b">` run on into the quoted
/// value and splice the attribute inside it.
fn inject_attribute(tag: &str, line: usize) -> String {
    if !tag.starts_with('<') {
        return tag.to_owned();
    }
    // Skip the leading `<`, then find where the element name stops.
    // The stop byte is ASCII (or we hit the end of the tag), so the
    // split below is always on a character boundary.
    let name_end = tag
        .bytes()
        .skip(1)
        .position(|b| b.is_ascii_whitespace() || b == b'/' || b == b'>')
        .map_or(tag.len(), |offset| offset + 1);
    let (head, tail) = tag.split_at(name_end);
    format!("{} data-afm-source-line=\"{}\"{}", head, line, tail)
}
173
#[cfg(test)]
mod tests {
    use super::*;

    /// A lone paragraph receives the single supplied line number.
    #[test]
    fn injects_anchor_into_first_paragraph() {
        let rendered = inject_anchors("<p>hello</p>", &[1]);
        let expected = r#"<p data-afm-source-line="1">hello</p>"#;
        assert_eq!(rendered, expected);
    }

    /// Consecutive top-level blocks consume line numbers in order.
    #[test]
    fn injects_anchors_for_multiple_top_level_blocks() {
        let source_lines = [1, 3];
        let rendered = inject_anchors("<h1>a</h1><p>b</p>", &source_lines);
        assert!(rendered.contains(r#"<h1 data-afm-source-line="1">"#));
        assert!(rendered.contains(r#"<p data-afm-source-line="3">"#));
    }

    /// Block elements nested inside a top-level block are left alone:
    /// the outer `<blockquote>` is anchored, the inner `<p>` is not.
    #[test]
    fn does_not_anchor_nested_blocks() {
        let fragment = "<blockquote><p>x</p></blockquote>";
        let rendered = inject_anchors(fragment, &[1]);
        assert!(rendered.contains(r#"<blockquote data-afm-source-line="1">"#));
        assert!(!rendered.contains(r"<p data-afm-source-line="));
    }

    /// With no line numbers to hand out the input is returned as-is.
    #[test]
    fn no_op_when_lines_is_empty() {
        let html = "<p>x</p>";
        assert_eq!(inject_anchors(html, &[]), html);
    }

    /// A void `<hr>` is anchored and does not swallow the block that
    /// follows it.
    #[test]
    fn handles_void_tags_at_top_level() {
        let source_lines = [1, 2];
        let rendered = inject_anchors("<hr><p>x</p>", &source_lines);
        assert!(rendered.contains(r#"<hr data-afm-source-line="1">"#));
        assert!(rendered.contains(r#"<p data-afm-source-line="2">"#));
    }

    /// Inline elements never receive an anchor.
    #[test]
    fn ignores_inline_tags() {
        let fragment = "<p><strong>x</strong></p>";
        let rendered = inject_anchors(fragment, &[1]);
        assert!(rendered.contains(r#"<p data-afm-source-line="1">"#));
        assert!(!rendered.contains(r"<strong data-afm-source-line="));
    }

    /// `tag_name` copes with plain, attributed, closing and
    /// self-closing spellings.
    #[test]
    fn tag_name_extracts_the_lower_case_element_name() {
        let plain = tag_name("<p>");
        let with_attribute = tag_name("<p class=\"x\">");
        let closing = tag_name("</p>");
        let self_closing = tag_name("<hr/>");
        assert_eq!(plain, "p");
        assert_eq!(with_attribute, "p");
        assert_eq!(closing, "p");
        assert_eq!(self_closing, "hr");
    }
}