aozora_encoding/lib.rs
1//! Encoding utilities for Aozora Bunko source material.
2//!
3//! The `aozora` parser itself is strictly UTF-8. Anything that decodes `Shift_JIS` or
4//! resolves gaiji (外字) mappings lives here, so the parser stays free of encoding
5//! concerns and the same logic is available to CLI, editor integrations, or
6//! downstream tools.
7
8#![forbid(unsafe_code)]
9
10use encoding_rs::{DecoderResult, SHIFT_JIS};
11use miette::Diagnostic;
12use thiserror::Error;
13
/// Errors surfaced by the decode pipeline.
///
/// The enum is `#[non_exhaustive]`, so code outside this crate must keep a
/// wildcard arm when matching on it; new variants may be added later without
/// that being a breaking change.
#[derive(Debug, Error, Diagnostic)]
#[non_exhaustive]
pub enum DecodeError {
    /// The input could not be decoded as `Shift_JIS`.
    ///
    /// Produced by [`decode_sjis`] / [`decode_sjis_into`] when the decoder
    /// reports a malformed byte sequence, when it reports that the output
    /// buffer is full, or when the worst-case output size cannot be computed.
    /// The user-facing message is in Japanese ("conversion from Shift_JIS
    /// failed (invalid byte sequence)") and the diagnostic code is
    /// `aozora::encoding::sjis_invalid`.
    #[error("Shift_JIS からの変換に失敗しました (不正なバイト列)")]
    #[diagnostic(code(aozora::encoding::sjis_invalid))]
    ShiftJisInvalid,
}
22
23/// Decode a `Shift_JIS` byte slice into UTF-8 (NFC normalisation is applied by the
24/// caller after decoding).
25///
26/// # Errors
27///
28/// Returns [`DecodeError::ShiftJisInvalid`] if `encoding_rs` reports a malformed byte
29/// sequence. Lossy replacement is deliberately not offered — callers need to know
30/// when they're looking at corrupted source material rather than silently absorbing
31/// the damage.
32///
33/// Allocates a fresh `String` per call. For workloads that decode many
34/// documents in succession, prefer [`decode_sjis_into`] with a reusable
35/// buffer to avoid the per-call allocation.
36pub fn decode_sjis(input: &[u8]) -> Result<String, DecodeError> {
37 let mut out = String::new();
38 decode_sjis_into(input, &mut out)?;
39 Ok(out)
40}
41
42/// Decode a `Shift_JIS` byte slice into the caller-owned `dst` buffer.
43///
44/// Pre-sizes `dst` exactly via
45/// `encoding_rs::Decoder::max_utf8_buffer_length_without_replacement`
46/// so the decode inner loop does no growth-realloc. The buffer is
47/// **not** cleared first — callers that want a fresh decode should
48/// `dst.clear()` before calling. This is intentional so the same
49/// buffer can be reused across many decodes in a thread-local /
50/// per-worker pool without paying the allocator per iteration.
51///
52/// Strict — same error contract as [`decode_sjis`]. Bypasses
53/// `encoding_rs`'s public `decode` shape, which always allocates a
54/// worst-case-sized `String` internally and `Cow::into_owned`s the
55/// result; this entry point goes straight through the
56/// `Decoder::decode_to_string_without_replacement` API the bench
57/// pipeline needs.
58///
59/// # Errors
60///
61/// Returns [`DecodeError::ShiftJisInvalid`] on malformed input or if
62/// the encoder reports overflow (which `max_utf8_buffer_length_…`
63/// should make unreachable, but is still surfaced rather than
64/// silently truncated).
65pub fn decode_sjis_into(input: &[u8], dst: &mut String) -> Result<(), DecodeError> {
66 let mut decoder = SHIFT_JIS.new_decoder_without_bom_handling();
67 let needed = decoder
68 .max_utf8_buffer_length_without_replacement(input.len())
69 .ok_or(DecodeError::ShiftJisInvalid)?;
70 dst.reserve(needed);
71 let (result, _read) = decoder.decode_to_string_without_replacement(input, dst, true);
72 match result {
73 DecoderResult::InputEmpty => Ok(()),
74 DecoderResult::Malformed(_, _) | DecoderResult::OutputFull => {
75 Err(DecodeError::ShiftJisInvalid)
76 }
77 }
78}
79
/// Report whether `input` begins with the UTF-8 byte-order mark `EF BB BF`.
///
/// The CLI uses this to strip the BOM before handing input to the parser.
/// Because the CLI requires an explicit `--encoding` flag, BOM presence is
/// the only runtime signal needed; a full encoding sniffer (BOM plus
/// byte-frequency heuristics) is intentionally out of scope until
/// unknown-encoding input streams become a concern.
///
/// Inputs shorter than three bytes never match. A slice that is exactly the
/// three BOM bytes does.
#[must_use]
pub const fn has_utf8_bom(input: &[u8]) -> bool {
    // Destructure the first three bytes; the `..` tail may be empty.
    if let [first, second, third, ..] = input {
        *first == 0xEF && *second == 0xBB && *third == 0xBF
    } else {
        false
    }
}
91
// Public submodule for gaiji (外字) handling; its items are reached as
// `gaiji::…` from the rest of the crate and from downstream users.
pub mod gaiji;
/// PHF tables (single, combo, description) emitted by `build.rs`
/// at compile time via `phf_codegen`. Lives in `OUT_DIR` so it's
/// regenerated automatically when any input TSV changes; the
/// committed source tree carries only the data, not the perfect-
/// hash output. See `build.rs` for the generator.
///
/// The module is private: the generated tables are an implementation
/// detail and are only reachable from inside this crate.
#[allow(
    clippy::unreadable_literal,
    reason = "phf_codegen emits 64-bit perfect-hash keys without separators; \
              we cannot reformat them without forking the codegen crate"
)]
mod jisx0213_table {
    // Textually splice in the generated file. `OUT_DIR` is read from the
    // build environment at compile time, so this only builds through Cargo
    // with the crate's build script having run first.
    include!(concat!(env!("OUT_DIR"), "/jisx0213_table.rs"));
}
106
#[cfg(test)]
mod tests {
    //! Unit tests for the strict `Shift_JIS` decoder, UTF-8 BOM detection,
    //! and a smoke test of the `gaiji::lookup` primitive.
    //!
    //! Expected strings whose characters have width or compatibility
    //! variants (halfwidth katakana, fullwidth digits) are written as
    //! `\u{…}` escapes on purpose: literal glyphs are easily folded into a
    //! different code point by normalising tools, which would make the
    //! assertion wrong without any visible change.

    use super::*;

    // ------------------------------------------------------------------
    // SJIS happy-path decoding
    // ------------------------------------------------------------------

    #[test]
    fn decodes_plain_ascii_sjis() {
        assert_eq!(decode_sjis(b"hello").unwrap(), "hello");
    }

    #[test]
    fn decodes_japanese_sjis() {
        // 「青空文庫」 in Shift_JIS.
        let bytes = &[0x90, 0xC2, 0x8B, 0xF3, 0x95, 0xB6, 0x8C, 0xC9];
        assert_eq!(decode_sjis(bytes).unwrap(), "青空文庫");
    }

    #[test]
    fn decodes_empty_input_to_empty_string() {
        assert_eq!(decode_sjis(b"").unwrap(), "");
    }

    #[test]
    fn decodes_ascii_control_characters_verbatim() {
        // LF / CR / tab map 1:1 in SJIS because the lead-byte range does
        // not overlap ASCII. This pins that the decode step leaves them
        // untouched before the sanitize pass sees them.
        assert_eq!(decode_sjis(b"a\nb\rc\td").unwrap(), "a\nb\rc\td");
    }

    #[test]
    fn decodes_halfwidth_katakana() {
        // Halfwidth katakana (0xA1..=0xDF) is a single byte each in SJIS
        // and decodes into the halfwidth block U+FF61..=U+FF9F — NOT into
        // the fullwidth katakana block. Bytes 0xB1..=0xB5 are halfwidth
        // a / i / u / e / o, i.e. U+FF71..=U+FF75.
        let bytes = &[0xB1, 0xB2, 0xB3, 0xB4, 0xB5];
        assert_eq!(
            decode_sjis(bytes).unwrap(),
            "\u{FF71}\u{FF72}\u{FF73}\u{FF74}\u{FF75}"
        );
    }

    #[test]
    fn decodes_mixed_ascii_and_kanji() {
        // Common shape in Aozora corpora: explanatory text in ASCII
        // mixed with Japanese quotations.
        let mut bytes = Vec::from(*b"about ");
        bytes.extend_from_slice(&[0x93, 0xFA, 0x96, 0x7B]); // 日本
        bytes.extend_from_slice(b" !");
        assert_eq!(decode_sjis(&bytes).unwrap(), "about 日本 !");
    }

    #[test]
    fn decodes_hiragana_sjis() {
        // 「こんにちは」 — lead bytes in the 0x82 range.
        let bytes = &[
            0x82, 0xB1, // こ
            0x82, 0xF1, // ん
            0x82, 0xC9, // に
            0x82, 0xBF, // ち
            0x82, 0xCD, // は
        ];
        assert_eq!(decode_sjis(bytes).unwrap(), "こんにちは");
    }

    #[test]
    fn decodes_fullwidth_digits() {
        // Fullwidth digits zero, one, two: 0x82 0x4F..=0x51 decode to
        // U+FF10..=U+FF12, not to ASCII '0'..'2'. Fullwidth digits are
        // common in Aozora source text.
        let bytes = &[0x82, 0x4F, 0x82, 0x50, 0x82, 0x51];
        assert_eq!(decode_sjis(bytes).unwrap(), "\u{FF10}\u{FF11}\u{FF12}");
    }

    // ------------------------------------------------------------------
    // decode_sjis_into — buffer-reuse path equivalence
    // ------------------------------------------------------------------
    //
    // Every `into_equivalent_*` test below verifies the contract that
    // `decode_sjis(b) == decode_sjis_into(b, &mut buf)` byte-for-byte
    // (and that both fail together in the strict-error case).
    // `decode_sjis_into` is the buffer-reuse entry point used by the
    // bench `parallel_size_bands` thread-local pool; the production
    // `decode_sjis` is a thin wrapper that calls `decode_sjis_into`
    // with a fresh `String`.

    /// Decode `input` through both entry points, starting from an empty
    /// buffer, and panic if they disagree on success/failure or on the
    /// decoded text.
    fn check_equivalent(input: &[u8]) {
        let owned = decode_sjis(input);
        let mut buf = String::new();
        let into_result = decode_sjis_into(input, &mut buf);
        match (owned, into_result) {
            (Ok(s), Ok(())) => assert_eq!(s, buf, "decode_sjis output != decode_sjis_into output"),
            (Err(_), Err(_)) => {} // both fail — identical strict error contract
            (Ok(s), Err(e)) => panic!("owned succeeded ({s:?}) but _into failed ({e:?})"),
            (Err(e), Ok(())) => panic!("owned failed ({e:?}) but _into succeeded ({buf:?})"),
        }
    }

    #[test]
    fn into_equivalent_on_ascii() {
        check_equivalent(b"hello world");
    }

    #[test]
    fn into_equivalent_on_japanese() {
        check_equivalent(&[0x90, 0xC2, 0x8B, 0xF3, 0x95, 0xB6, 0x8C, 0xC9]);
    }

    #[test]
    fn into_equivalent_on_empty() {
        check_equivalent(b"");
    }

    #[test]
    fn into_equivalent_on_halfwidth_katakana() {
        check_equivalent(&[0xB1, 0xB2, 0xB3, 0xB4, 0xB5]);
    }

    #[test]
    fn into_equivalent_on_invalid_lead_byte() {
        check_equivalent(&[0xFF, 0xFF]);
    }

    #[test]
    fn into_equivalent_on_lone_lead_byte() {
        check_equivalent(&[b'o', b'k', 0x82]);
    }

    #[test]
    fn into_reuses_buffer_capacity_across_calls() {
        // The buffer-reuse contract: a `dst` String that already has
        // enough capacity must keep it across `clear() +
        // decode_sjis_into` cycles. Pinning an exact byte count would
        // couple the test to allocator / encoding_rs internals; the
        // load-bearing invariant asserted here is "no shrink".
        let mut buf = String::with_capacity(4096);
        let cap_before = buf.capacity();
        decode_sjis_into(b"hello", &mut buf).unwrap();
        let cap_after_first = buf.capacity();
        assert!(
            cap_after_first >= cap_before,
            "capacity must not shrink on small decode"
        );
        buf.clear();
        decode_sjis_into(b"world", &mut buf).unwrap();
        assert!(
            buf.capacity() >= cap_after_first,
            "capacity must not shrink on a buffer-reuse cycle"
        );
    }

    #[test]
    fn into_appends_when_dst_not_cleared() {
        // Documented contract: callers must `clear()` before each
        // decode if they want a fresh result. This test pins that
        // shape so future "convenience clear inside the function"
        // changes break loudly.
        let mut buf = String::from("PRE:");
        decode_sjis_into(b"hi", &mut buf).unwrap();
        assert_eq!(buf, "PRE:hi");
    }

    // ------------------------------------------------------------------
    // SJIS error surfaces
    // ------------------------------------------------------------------

    #[test]
    fn rejects_invalid_lead_byte() {
        let bytes = &[0xFF, 0xFF];
        assert!(matches!(
            decode_sjis(bytes),
            Err(DecodeError::ShiftJisInvalid)
        ));
    }

    #[test]
    fn rejects_lone_lead_byte_at_end_of_input() {
        // 0x82 alone is a truncated two-byte sequence (expects trail).
        let bytes = &[b'o', b'k', 0x82];
        assert!(matches!(
            decode_sjis(bytes),
            Err(DecodeError::ShiftJisInvalid)
        ));
    }

    #[test]
    fn rejects_invalid_trail_byte() {
        // Lead 0x82 with an invalid trail 0x00 (trails must be 0x40..=0xFC, != 0x7F).
        let bytes = &[0x82, 0x00];
        assert!(matches!(
            decode_sjis(bytes),
            Err(DecodeError::ShiftJisInvalid)
        ));
    }

    #[test]
    fn error_message_is_japanese_and_carries_miette_code() {
        // The project-wide rule is that user-facing errors are in
        // Japanese. Pin that and the miette diagnostic code both.
        let err = decode_sjis(&[0xFF, 0xFF]).unwrap_err();
        let message = format!("{err}");
        assert!(
            message.contains("Shift_JIS"),
            "error message must contain Shift_JIS for locatability, got {message:?}",
        );
        // The ASCII token alone does not prove the message is Japanese;
        // check a distinctive Japanese fragment ("conversion failed").
        assert!(
            message.contains("変換に失敗しました"),
            "error message must be in Japanese, got {message:?}",
        );
        // Fully qualified call so the check does not depend on which
        // traits the glob import happens to bring into scope.
        let code = miette::Diagnostic::code(&err).map(|c| c.to_string());
        assert_eq!(
            code.as_deref(),
            Some("aozora::encoding::sjis_invalid"),
            "miette diagnostic code must stay stable",
        );
    }

    // ------------------------------------------------------------------
    // UTF-8 BOM detection
    // ------------------------------------------------------------------

    #[test]
    fn detects_utf8_bom() {
        assert!(has_utf8_bom(b"\xEF\xBB\xBFtext"));
    }

    #[test]
    fn no_utf8_bom_on_plain_input() {
        assert!(!has_utf8_bom(b"text"));
    }

    #[test]
    fn no_utf8_bom_on_shorter_than_bom() {
        assert!(!has_utf8_bom(b"\xEF\xBB"));
    }

    #[test]
    fn no_utf8_bom_on_empty_input() {
        assert!(!has_utf8_bom(b""));
    }

    #[test]
    fn detects_utf8_bom_on_exactly_three_bytes() {
        // Boundary: the slice is exactly `EF BB BF` with no trailing
        // content; a BOM-only input must still be detected.
        assert!(has_utf8_bom(&[0xEF, 0xBB, 0xBF]));
    }

    #[test]
    fn bom_detection_rejects_near_misses() {
        // Off-by-one patterns that are NOT the UTF-8 BOM.
        assert!(!has_utf8_bom(&[0xEF, 0xBB, 0xBE])); // last byte wrong
        assert!(!has_utf8_bom(&[0xEE, 0xBB, 0xBF])); // first byte wrong
        assert!(!has_utf8_bom(&[0xEF, 0xBC, 0xBF])); // middle byte wrong
        assert!(!has_utf8_bom(&[0xFE, 0xFF])); // UTF-16 BE BOM — not ours
        assert!(!has_utf8_bom(&[0xFF, 0xFE])); // UTF-16 LE BOM — not ours
    }

    // ------------------------------------------------------------------
    // Gaiji resolution (via primitive `gaiji::lookup`)
    // ------------------------------------------------------------------

    #[test]
    fn gaiji_lookup_echoes_existing_ucs_when_set() {
        assert_eq!(
            gaiji::lookup(Some('吶'), Some("第3水準1-85-54"), "木+吶のつくり"),
            Some(gaiji::Resolved::Char('吶'))
        );
    }

    #[test]
    fn gaiji_lookup_returns_none_when_unresolvable() {
        assert_eq!(gaiji::lookup(None, None, "第3水準1-85-54"), None);
    }
}