aozora_encoding/
lib.rs

1//! Encoding utilities for Aozora Bunko source material.
2//!
3//! The `aozora` parser itself is strictly UTF-8. Anything that decodes `Shift_JIS` or
4//! resolves gaiji (外字) mappings lives here, so the parser stays free of encoding
5//! concerns and the same logic is available to CLI, editor integrations, or
6//! downstream tools.
7
8#![forbid(unsafe_code)]
9
10use encoding_rs::{DecoderResult, SHIFT_JIS};
11use miette::Diagnostic;
12use thiserror::Error;
13
/// Errors surfaced by the decode pipeline.
///
/// The enum is `#[non_exhaustive]`, so code matching on it from another
/// crate must include a wildcard arm.
#[derive(Debug, Error, Diagnostic)]
#[non_exhaustive]
pub enum DecodeError {
    /// The input could not be converted from `Shift_JIS`.
    ///
    /// [`decode_sjis_into`] (and therefore [`decode_sjis`]) returns this
    /// when the decoder reports a malformed byte sequence, when the decoder
    /// reports a full output buffer, or when the worst-case output length
    /// cannot be computed.
    ///
    /// The `Display` text is Japanese — roughly "conversion from Shift_JIS
    /// failed (invalid byte sequence)" — and the diagnostic code is
    /// `aozora::encoding::sjis_invalid`.
    #[error("Shift_JIS からの変換に失敗しました (不正なバイト列)")]
    #[diagnostic(code(aozora::encoding::sjis_invalid))]
    ShiftJisInvalid,
}
22
23/// Decode a `Shift_JIS` byte slice into UTF-8 (NFC normalisation is applied by the
24/// caller after decoding).
25///
26/// # Errors
27///
28/// Returns [`DecodeError::ShiftJisInvalid`] if `encoding_rs` reports a malformed byte
29/// sequence. Lossy replacement is deliberately not offered — callers need to know
30/// when they're looking at corrupted source material rather than silently absorbing
31/// the damage.
32///
33/// Allocates a fresh `String` per call. For workloads that decode many
34/// documents in succession, prefer [`decode_sjis_into`] with a reusable
35/// buffer to avoid the per-call allocation.
36pub fn decode_sjis(input: &[u8]) -> Result<String, DecodeError> {
37    let mut out = String::new();
38    decode_sjis_into(input, &mut out)?;
39    Ok(out)
40}
41
42/// Decode a `Shift_JIS` byte slice into the caller-owned `dst` buffer.
43///
44/// Pre-sizes `dst` exactly via
45/// `encoding_rs::Decoder::max_utf8_buffer_length_without_replacement`
46/// so the decode inner loop does no growth-realloc. The buffer is
47/// **not** cleared first — callers that want a fresh decode should
48/// `dst.clear()` before calling. This is intentional so the same
49/// buffer can be reused across many decodes in a thread-local /
50/// per-worker pool without paying the allocator per iteration.
51///
52/// Strict — same error contract as [`decode_sjis`]. Bypasses
53/// `encoding_rs`'s public `decode` shape, which always allocates a
54/// worst-case-sized `String` internally and `Cow::into_owned`s the
55/// result; this entry point goes straight through the
56/// `Decoder::decode_to_string_without_replacement` API the bench
57/// pipeline needs.
58///
59/// # Errors
60///
61/// Returns [`DecodeError::ShiftJisInvalid`] on malformed input or if
62/// the encoder reports overflow (which `max_utf8_buffer_length_…`
63/// should make unreachable, but is still surfaced rather than
64/// silently truncated).
65pub fn decode_sjis_into(input: &[u8], dst: &mut String) -> Result<(), DecodeError> {
66    let mut decoder = SHIFT_JIS.new_decoder_without_bom_handling();
67    let needed = decoder
68        .max_utf8_buffer_length_without_replacement(input.len())
69        .ok_or(DecodeError::ShiftJisInvalid)?;
70    dst.reserve(needed);
71    let (result, _read) = decoder.decode_to_string_without_replacement(input, dst, true);
72    match result {
73        DecoderResult::InputEmpty => Ok(()),
74        DecoderResult::Malformed(_, _) | DecoderResult::OutputFull => {
75            Err(DecodeError::ShiftJisInvalid)
76        }
77    }
78}
79
/// Report whether `input` begins with the three-byte UTF-8 byte-order mark
/// (`EF BB BF`).
///
/// The CLI calls this to strip the BOM before handing input to the parser.
/// Because the CLI requires an explicit `--encoding` flag, BOM presence is
/// the only runtime signal needed. A full encoding sniffer (BOM plus
/// byte-frequency heuristic) is intentionally out of scope until
/// unknown-encoding input streams become a concern.
///
/// Inputs shorter than three bytes never match.
#[must_use]
pub const fn has_utf8_bom(input: &[u8]) -> bool {
    // Bind the first three bytes; anything shorter cannot carry a BOM.
    let [first, second, third, ..] = input else {
        return false;
    };
    *first == 0xEF && *second == 0xBB && *third == 0xBF
}
91
/// Gaiji (外字) resolution. The tests in this file exercise its `lookup`
/// function and `Resolved` type.
pub mod gaiji;
/// PHF tables (single, combo, description) emitted by `build.rs`
/// at compile time via `phf_codegen`.
///
/// The generated file lives in `OUT_DIR`, so it is regenerated
/// automatically when any input TSV changes; the committed source tree
/// carries only the data, not the perfect-hash output. See `build.rs`
/// for the generator.
///
/// The module is private: it only wraps the `include!` of the generated
/// source.
#[allow(
    clippy::unreadable_literal,
    reason = "phf_codegen emits 64-bit perfect-hash keys without separators; \
              we cannot reformat them without forking the codegen crate"
)]
mod jisx0213_table {
    // Pull in the file `build.rs` wrote to `$OUT_DIR/jisx0213_table.rs`.
    include!(concat!(env!("OUT_DIR"), "/jisx0213_table.rs"));
}
106
/// Unit tests for the SJIS decode entry points, UTF-8 BOM detection, and
/// the primitive gaiji lookup.
#[cfg(test)]
mod tests {
    use super::*;

    // ------------------------------------------------------------------
    // SJIS happy-path decoding
    // ------------------------------------------------------------------

    #[test]
    fn decodes_plain_ascii_sjis() {
        assert_eq!(decode_sjis(b"hello").unwrap(), "hello");
    }

    #[test]
    fn decodes_japanese_sjis() {
        // 「青空文庫」 in Shift_JIS.
        let bytes = &[0x90, 0xC2, 0x8B, 0xF3, 0x95, 0xB6, 0x8C, 0xC9];
        assert_eq!(decode_sjis(bytes).unwrap(), "青空文庫");
    }

    #[test]
    fn decodes_empty_input_to_empty_string() {
        assert_eq!(decode_sjis(b"").unwrap(), "");
    }

    #[test]
    fn decodes_ascii_control_characters_verbatim() {
        // LF / CR / tab are 1:1 identity in SJIS since the lead byte
        // range avoids ASCII. Exercising these pins that the pipeline
        // doesn't mangle them before the sanitize pass.
        assert_eq!(decode_sjis(b"a\nb\rc\td").unwrap(), "a\nb\rc\td");
    }

    #[test]
    fn decodes_halfwidth_katakana() {
        // Halfwidth katakana (0xA1..=0xDF) is a single byte each in SJIS.
        // `ｱｲｳｴｵ` → bytes 0xB1..=0xB5.
        let bytes = &[0xB1, 0xB2, 0xB3, 0xB4, 0xB5];
        assert_eq!(decode_sjis(bytes).unwrap(), "ｱｲｳｴｵ");
    }

    #[test]
    fn decodes_mixed_ascii_and_kanji() {
        // Common shape in Aozora corpora: explanatory text in ASCII
        // mixed with Japanese quotations.
        let mut bytes = Vec::from(*b"about ");
        bytes.extend_from_slice(&[0x93, 0xFA, 0x96, 0x7B]); // 日本
        bytes.extend_from_slice(b" !");
        assert_eq!(decode_sjis(&bytes).unwrap(), "about 日本 !");
    }

    #[test]
    fn decodes_hiragana_sjis() {
        // 「こんにちは」 — lead bytes in the 0x82 range.
        let bytes = &[
            0x82, 0xB1, // こ
            0x82, 0xF1, // ん
            0x82, 0xC9, // に
            0x82, 0xBF, // ち
            0x82, 0xCD, // は
        ];
        assert_eq!(decode_sjis(bytes).unwrap(), "こんにちは");
    }

    #[test]
    fn decodes_fullwidth_digits() {
        // ０１２ — fullwidth digits are common in Aozora ruby delimiters.
        // 0x82 0x4F is fullwidth zero; the following code points are
        // consecutive.
        let bytes = &[0x82, 0x4F, 0x82, 0x50, 0x82, 0x51];
        assert_eq!(decode_sjis(bytes).unwrap(), "０１２");
    }

    // ------------------------------------------------------------------
    // decode_sjis_into — buffer-reuse path equivalence
    // ------------------------------------------------------------------
    //
    // Every `into_equivalent_*` test below verifies the contract that
    // `decode_sjis(b) == decode_sjis_into(b, &mut buf)` byte-for-byte
    // (and for the strict-error case, that both return `Err`).
    // `decode_sjis_into` is the buffer-reuse entry point used by the
    // bench `parallel_size_bands` thread-local pool; the production
    // `decode_sjis` is a thin wrapper that calls `decode_sjis_into`
    // with a fresh `String`.

    /// Assert that the owning and buffer-reuse entry points agree on
    /// `input`: same decoded text on success, or both failing.
    fn check_equivalent(input: &[u8]) {
        let owned = decode_sjis(input);
        let mut buf = String::new();
        let into_result = decode_sjis_into(input, &mut buf);
        match (owned, into_result) {
            (Ok(s), Ok(())) => assert_eq!(s, buf, "decode_sjis output != decode_sjis_into output"),
            (Err(_), Err(_)) => {} // both fail — identical strict error contract
            (Ok(s), Err(e)) => panic!("owned succeeded ({s:?}) but _into failed ({e:?})"),
            (Err(e), Ok(())) => panic!("owned failed ({e:?}) but _into succeeded ({buf:?})"),
        }
    }

    #[test]
    fn into_equivalent_on_ascii() {
        check_equivalent(b"hello world");
    }

    #[test]
    fn into_equivalent_on_japanese() {
        check_equivalent(&[0x90, 0xC2, 0x8B, 0xF3, 0x95, 0xB6, 0x8C, 0xC9]);
    }

    #[test]
    fn into_equivalent_on_empty() {
        check_equivalent(b"");
    }

    #[test]
    fn into_equivalent_on_halfwidth_katakana() {
        check_equivalent(&[0xB1, 0xB2, 0xB3, 0xB4, 0xB5]);
    }

    #[test]
    fn into_equivalent_on_invalid_lead_byte() {
        check_equivalent(&[0xFF, 0xFF]);
    }

    #[test]
    fn into_equivalent_on_lone_lead_byte() {
        check_equivalent(&[b'o', b'k', 0x82]);
    }

    #[test]
    fn into_reuses_buffer_capacity_across_calls() {
        // The buffer-reuse contract: a `dst` String that already has
        // enough capacity should not allocate again on the second
        // decode. We verify this by asserting capacity is preserved
        // across `clear() + decode_sjis_into` cycles. (Pinning the
        // exact byte count would couple the test to allocator /
        // encoding_rs internals; the load-bearing invariant is "no
        // shrink".)
        let mut buf = String::with_capacity(4096);
        let cap_before = buf.capacity();
        decode_sjis_into(b"hello", &mut buf).unwrap();
        let cap_after_first = buf.capacity();
        assert!(
            cap_after_first >= cap_before,
            "capacity must not shrink on small decode"
        );
        buf.clear();
        decode_sjis_into(b"world", &mut buf).unwrap();
        assert!(
            buf.capacity() >= cap_after_first,
            "capacity must not shrink on a buffer-reuse cycle"
        );
    }

    #[test]
    fn into_appends_when_dst_not_cleared() {
        // Documented contract: callers must `clear()` before each
        // decode if they want a fresh result. This test pins that
        // shape so future "convenience clear inside the function"
        // changes break loudly.
        let mut buf = String::from("PRE:");
        decode_sjis_into(b"hi", &mut buf).unwrap();
        assert_eq!(buf, "PRE:hi");
    }

    // ------------------------------------------------------------------
    // SJIS error surfaces
    // ------------------------------------------------------------------

    #[test]
    fn rejects_invalid_lead_byte() {
        let bytes = &[0xFF, 0xFF];
        assert!(matches!(
            decode_sjis(bytes),
            Err(DecodeError::ShiftJisInvalid)
        ));
    }

    #[test]
    fn rejects_lone_lead_byte_at_end_of_input() {
        // 0x82 alone is a truncated two-byte sequence (expects trail).
        let bytes = &[b'o', b'k', 0x82];
        assert!(matches!(
            decode_sjis(bytes),
            Err(DecodeError::ShiftJisInvalid)
        ));
    }

    #[test]
    fn rejects_invalid_trail_byte() {
        // Lead 0x82 with an invalid trail 0x00 (trails must be 0x40..=0xFC, != 0x7F).
        let bytes = &[0x82, 0x00];
        assert!(matches!(
            decode_sjis(bytes),
            Err(DecodeError::ShiftJisInvalid)
        ));
    }

    #[test]
    fn error_message_is_japanese_and_carries_miette_code() {
        // The project-wide rule is that user-facing errors are in
        // Japanese. Pin that and the miette diagnostic code both.
        let err = decode_sjis(&[0xFF, 0xFF]).unwrap_err();
        let message = format!("{err}");
        assert!(
            message.contains("Shift_JIS"),
            "error message must contain Shift_JIS for locatability, got {message:?}",
        );
        assert!(
            message.contains("変換に失敗しました"),
            "error message must be in Japanese, got {message:?}",
        );
        // Fully qualified so the check does not depend on which traits the
        // glob import happens to bring into scope.
        let code = miette::Diagnostic::code(&err).map(|c| c.to_string());
        assert_eq!(
            code.as_deref(),
            Some("aozora::encoding::sjis_invalid"),
            "miette diagnostic code must stay stable",
        );
    }

    // ------------------------------------------------------------------
    // UTF-8 BOM detection
    // ------------------------------------------------------------------

    #[test]
    fn detects_utf8_bom() {
        assert!(has_utf8_bom(b"\xEF\xBB\xBFtext"));
    }

    #[test]
    fn no_utf8_bom_on_plain_input() {
        assert!(!has_utf8_bom(b"text"));
    }

    #[test]
    fn no_utf8_bom_on_shorter_than_bom() {
        assert!(!has_utf8_bom(b"\xEF\xBB"));
    }

    #[test]
    fn no_utf8_bom_on_empty_input() {
        assert!(!has_utf8_bom(b""));
    }

    #[test]
    fn detects_utf8_bom_on_exactly_three_bytes() {
        // Boundary: the slice is exactly `EF BB BF` with no trailing
        // content, so detection must not require anything after the BOM.
        assert!(has_utf8_bom(&[0xEF, 0xBB, 0xBF]));
    }

    #[test]
    fn bom_detection_rejects_near_misses() {
        // Off-by-one patterns that are NOT the UTF-8 BOM.
        assert!(!has_utf8_bom(&[0xEF, 0xBB, 0xBE])); // last byte wrong
        assert!(!has_utf8_bom(&[0xEE, 0xBB, 0xBF])); // first byte wrong
        assert!(!has_utf8_bom(&[0xEF, 0xBC, 0xBF])); // middle byte wrong
        assert!(!has_utf8_bom(&[0xFE, 0xFF])); // UTF-16 BE BOM — not ours
        assert!(!has_utf8_bom(&[0xFF, 0xFE])); // UTF-16 LE BOM — not ours
    }

    // ------------------------------------------------------------------
    // Gaiji resolution (via primitive `gaiji::lookup`)
    // ------------------------------------------------------------------

    #[test]
    fn gaiji_lookup_echoes_existing_ucs_when_set() {
        assert_eq!(
            gaiji::lookup(Some('吶'), Some("第3水準1-85-54"), "木＋吶のつくり"),
            Some(gaiji::Resolved::Char('吶'))
        );
    }

    #[test]
    fn gaiji_lookup_returns_none_when_unresolvable() {
        assert_eq!(gaiji::lookup(None, None, "第3水準1-85-54"), None);
    }
}
aozora_encoding/lib.rs

aozora_encoding/
lib.rs