#![forbid(unsafe_code)] //! Licensed corpus discovery and aggregate reports. use fparkan_binary::{sha256, sha256_hex, Sha256Digest}; use fparkan_path::{ascii_lookup_key, normalize_relative, PathPolicy}; use std::collections::{BTreeMap, BTreeSet}; use std::fmt; use std::fs; use std::io::Write; use std::path::{Path, PathBuf}; use std::sync::Arc; /// Corpus kind. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum CorpusKind { /// Demo corpus. Demo, /// Part 1 full game. Part1, /// Part 2 full game. Part2, /// Unknown local directory. Unknown, } /// Corpus root. #[derive(Clone, Debug, Eq, PartialEq)] pub struct CorpusRoot(pub PathBuf); /// Discovery options. #[derive(Clone, Copy, Debug, Default)] pub struct DiscoverOptions { /// Whether symlinks may be traversed. pub follow_symlinks: bool, } /// File manifest entry. #[derive(Clone, Debug, Eq, PartialEq)] pub struct ManifestEntry { /// Normalized relative path. pub path: String, /// File size in bytes. pub size: u64, /// SHA-256 content fingerprint. pub hash: Sha256Digest, } /// Corpus manifest. #[derive(Clone, Debug, Eq, PartialEq)] pub struct CorpusManifest { /// Kind. pub kind: CorpusKind, /// Sorted files. pub files: Vec, /// Casefold collisions. pub casefold_collisions: Vec>, } /// Aggregate report. #[derive(Clone, Debug, Eq, PartialEq)] pub struct CorpusReport { /// Schema version. pub schema: u32, /// Kind. pub kind: CorpusKind, /// Total files. pub files: usize, /// Total bytes. pub bytes: u64, /// Metrics. pub metrics: BTreeMap, /// Casefold collision count. pub casefold_collisions: usize, /// Manifest fingerprint. pub fingerprint: Sha256Digest, /// Per-file status records. pub records: Vec, /// Number of files with report errors. pub failures: usize, } /// Per-file report status. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum CorpusFileStatus { /// File was inspected successfully. Ok, /// File was inspected but produced a non-fatal warning. Warning, /// File could not be inspected. 
Error, } /// Per-file report record. #[derive(Clone, Debug, Eq, PartialEq)] pub struct CorpusFileRecord { /// Normalized relative path. pub path: String, /// Inspection status. pub status: CorpusFileStatus, /// Detected file variant. pub variant: String, /// Optional status message. pub message: Option, } /// Corpus error. #[derive(Debug)] pub enum CorpusError { /// I/O failure. Io { /// Path where I/O failed. path: PathBuf, /// Source error. source: std::io::Error, }, /// Invalid root. InvalidRoot(PathBuf), /// Invalid path. InvalidPath(String), /// Aggregate report failure. Report { /// Path where reporting failed. path: String, /// Failure message. message: String, }, } impl fmt::Display for CorpusError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { Self::Io { path, source } => write!(f, "{}: {source}", path.display()), Self::InvalidRoot(path) => write!(f, "invalid corpus root: {}", path.display()), Self::InvalidPath(path) => write!(f, "invalid corpus path: {path}"), Self::Report { path, message } => write!(f, "{path}: {message}"), } } } impl std::error::Error for CorpusError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self { Self::Io { source, .. } => Some(source), Self::InvalidRoot(_) | Self::InvalidPath(_) | Self::Report { .. } => None, } } } /// Discovers a corpus under a root directory. /// /// # Errors /// /// Returns [`CorpusError`] if the root is invalid, traversal encounters an I/O /// error, or a discovered path cannot be represented by the legacy path policy. 
pub fn discover(root: &Path, options: DiscoverOptions) -> Result { if !root.is_dir() { return Err(CorpusError::InvalidRoot(root.to_path_buf())); } let mut files = Vec::new(); walk(root, root, options, &mut files)?; files.sort_by(|a, b| a.path.cmp(&b.path)); let kind = classify(root, &files); let casefold_collisions = detect_casefold_collisions(&files); Ok(CorpusManifest { kind, files, casefold_collisions, }) } fn walk( root: &Path, dir: &Path, options: DiscoverOptions, out: &mut Vec, ) -> Result<(), CorpusError> { let read_dir = fs::read_dir(dir).map_err(|source| CorpusError::Io { path: dir.to_path_buf(), source, })?; let mut entries = Vec::new(); for entry in read_dir { let entry = entry.map_err(|source| CorpusError::Io { path: dir.to_path_buf(), source, })?; entries.push(entry.path()); } entries.sort(); for path in entries { if path .file_name() .and_then(|name| name.to_str()) .is_some_and(|name| name.starts_with('.')) { continue; } let metadata = fs::symlink_metadata(&path).map_err(|source| CorpusError::Io { path: path.clone(), source, })?; if metadata.file_type().is_symlink() && !options.follow_symlinks { continue; } if metadata.is_dir() { walk(root, &path, options, out)?; continue; } if !metadata.is_file() { continue; } let rel = path .strip_prefix(root) .map_err(|_| CorpusError::InvalidPath(path.display().to_string()))?; let rel_text = rel .to_str() .ok_or_else(|| CorpusError::InvalidPath(path.display().to_string()))?; let normalized = normalize_relative(rel_text.as_bytes(), PathPolicy::HostCompatible) .map_err(|_| CorpusError::InvalidPath(rel_text.to_string()))?; let bytes = fs::read(&path).map_err(|source| CorpusError::Io { path: path.clone(), source, })?; out.push(ManifestEntry { path: normalized.as_str().to_string(), size: metadata.len(), hash: sha256(&bytes), }); } Ok(()) } fn classify(root: &Path, files: &[ManifestEntry]) -> CorpusKind { let name = root .file_name() .and_then(|v| v.to_str()) .unwrap_or_default() .to_ascii_uppercase(); if name == "IS" { 
CorpusKind::Part1 } else if name == "IS2" { CorpusKind::Part2 } else if files .iter() .any(|f| f.path.eq_ignore_ascii_case("iron_3d.exe")) { CorpusKind::Part1 } else { CorpusKind::Unknown } } fn detect_casefold_collisions(files: &[ManifestEntry]) -> Vec> { let mut grouped: BTreeMap, BTreeSet> = BTreeMap::new(); for file in files { grouped .entry(ascii_lookup_key(file.path.as_bytes()).0) .or_default() .insert(file.path.clone()); } grouped .into_values() .filter(|paths| paths.len() > 1) .map(|paths| paths.into_iter().collect()) .collect() } /// Builds aggregate report. /// /// # Errors /// /// Returns [`CorpusError`] when the aggregate report cannot be constructed. /// Per-file inspection failures are represented in [`CorpusReport::records`] /// and counted in [`CorpusReport::failures`]. pub fn report(root: &Path, manifest: &CorpusManifest) -> Result { let mut metrics = empty_report_metrics(); let mut records = Vec::with_capacity(manifest.files.len()); let mut failures = 0usize; for entry in &manifest.files { let record = inspect_report_file(root, entry, &mut metrics); if record.status == CorpusFileStatus::Error { failures = failures.saturating_add(1); } records.push(record); } Ok(CorpusReport { schema: 1, kind: manifest.kind, files: manifest.files.len(), bytes: manifest.files.iter().map(|f| f.size).sum(), metrics, casefold_collisions: manifest.casefold_collisions.len(), fingerprint: fingerprint(manifest), records, failures, }) } fn empty_report_metrics() -> BTreeMap { let mut metrics = BTreeMap::new(); metrics.insert("nres_files".to_string(), 0); metrics.insert("nres_entries".to_string(), 0); metrics.insert("rsli_files".to_string(), 0); metrics.insert("tma_files".to_string(), 0); metrics.insert("land_msh_files".to_string(), 0); metrics.insert("land_map_files".to_string(), 0); metrics.insert("unit_dat_files".to_string(), 0); metrics.insert("msh_entries".to_string(), 0); metrics.insert("mat0_entries".to_string(), 0); metrics.insert("texm_entries".to_string(), 0); 
metrics.insert("fxid_entries".to_string(), 0); metrics.insert("wear_entries".to_string(), 0); metrics } fn inspect_report_file( root: &Path, entry: &ManifestEntry, metrics: &mut BTreeMap, ) -> CorpusFileRecord { let lower = entry.path.to_ascii_lowercase(); let mut variant = inspect_path_metrics(&lower, metrics); let path = root.join(&entry.path); let bytes = match fs::read(&path) { Ok(bytes) => bytes, Err(source) => { return CorpusFileRecord { path: entry.path.clone(), status: CorpusFileStatus::Error, variant, message: Some(source.to_string()), }; } }; if bytes.starts_with(b"NRes") { variant = "nres".to_string(); bump(metrics, "nres_files", 1); if let Err(message) = inspect_nres_metrics(bytes, metrics) { return CorpusFileRecord { path: entry.path.clone(), status: CorpusFileStatus::Error, variant, message: Some(message), }; } } else if bytes.starts_with(b"NL") { variant = "rsli".to_string(); bump(metrics, "rsli_files", 1); } CorpusFileRecord { path: entry.path.clone(), status: CorpusFileStatus::Ok, variant, message: None, } } fn inspect_path_metrics(lower: &str, metrics: &mut BTreeMap) -> String { let mut variant = "file"; if lower.ends_with("data.tma") { bump(metrics, "tma_files", 1); variant = "tma"; } if lower.ends_with("land.msh") { bump(metrics, "land_msh_files", 1); variant = "land_msh"; } if lower.ends_with("land.map") { bump(metrics, "land_map_files", 1); variant = "land_map"; } if has_extension(lower, "dat") && (lower.starts_with("units/") || lower.contains("/units/")) { bump(metrics, "unit_dat_files", 1); variant = "unit_dat"; } variant.to_string() } fn inspect_nres_metrics(bytes: Vec, metrics: &mut BTreeMap) -> Result<(), String> { let entries = inspect_nres_entries(bytes)?; bump(metrics, "nres_entries", entries.len() as u64); for entry in entries { let name = String::from_utf8_lossy(entry.name_bytes()).to_ascii_lowercase(); if has_extension(&name, "msh") { bump(metrics, "msh_entries", 1); } match entry.meta().type_id { 0x3054_414D => { bump(metrics, 
"mat0_entries", 1); } 0x6D78_6554 => { bump(metrics, "texm_entries", 1); } 0x4449_5846 => { bump(metrics, "fxid_entries", 1); } 0x5241_4557 => { bump(metrics, "wear_entries", 1); } _ => {} } } Ok(()) } fn bump(metrics: &mut BTreeMap, key: &str, delta: u64) { if let Some(value) = metrics.get_mut(key) { *value = value.saturating_add(delta); } } fn has_extension(path: &str, expected: &str) -> bool { Path::new(path) .extension() .is_some_and(|extension| extension.eq_ignore_ascii_case(expected)) } fn inspect_nres_entries(bytes: Vec) -> Result, String> { let document = fparkan_nres::decode( Arc::from(bytes.into_boxed_slice()), fparkan_nres::ReadProfile::Compatible, ) .map_err(|err| err.to_string())?; Ok(document.entries().to_vec()) } /// Computes stable manifest fingerprint. #[must_use] pub fn fingerprint(manifest: &CorpusManifest) -> Sha256Digest { let mut bytes = Vec::new(); for file in &manifest.files { bytes.extend_from_slice(file.path.as_bytes()); bytes.push(0); bytes.extend_from_slice(&file.size.to_le_bytes()); bytes.extend_from_slice(&file.hash); } sha256(&bytes) } /// Writes report atomically. /// /// # Errors /// /// Returns [`CorpusError`] if the parent directory, temporary file, write, or /// final rename operation fails. pub fn write_report_atomic(path: &Path, report: &CorpusReport) -> Result<(), CorpusError> { let tmp = path.with_extension("tmp"); if let Some(parent) = path.parent() { fs::create_dir_all(parent).map_err(|source| CorpusError::Io { path: parent.to_path_buf(), source, })?; } let mut file = fs::File::create(&tmp).map_err(|source| CorpusError::Io { path: tmp.clone(), source, })?; file.write_all(render_report_json(report).as_bytes()) .map_err(|source| CorpusError::Io { path: tmp.clone(), source, })?; file.sync_all().map_err(|source| CorpusError::Io { path: tmp.clone(), source, })?; fs::rename(&tmp, path).map_err(|source| CorpusError::Io { path: path.to_path_buf(), source, })?; Ok(()) } /// Renders report JSON. 
#[must_use]
pub fn render_report_json(report: &CorpusReport) -> String {
    let mut out = format!(
        "{{\"schema_version\":\"fparkan-corpus-report-v1\",\"schema\":{},\"kind\":\"{:?}\",\"files\":{},\"bytes\":{},\"casefold_collisions\":{},\"fingerprint\":\"{}\",\"failures\":{},\"record_count\":{},\"metrics\":{{",
        report.schema,
        report.kind,
        report.files,
        report.bytes,
        report.casefold_collisions,
        sha256_hex(&report.fingerprint),
        report.failures,
        report.records.len()
    );
    for (idx, (key, value)) in report.metrics.iter().enumerate() {
        if idx > 0 {
            out.push(',');
        }
        out.push('"');
        push_json_escaped(&mut out, key);
        out.push_str("\":");
        out.push_str(&value.to_string());
    }
    // Exactly two objects are open here: "metrics" and the root. `push_str`
    // takes the text literally (no `format!`-style brace escaping), so this
    // must be two braces, not three.
    out.push_str("}}");
    out
}

/// Appends `text` to `out` with the escaping JSON requires inside a string
/// literal: quotes, backslashes and control characters.
fn push_json_escaped(out: &mut String, text: &str) {
    for ch in text.chars() {
        match ch {
            '"' => out.push_str("\\\""),
            '\\' => out.push_str("\\\\"),
            '\n' => out.push_str("\\n"),
            '\r' => out.push_str("\\r"),
            '\t' => out.push_str("\\t"),
            control if u32::from(control) < 0x20 => {
                out.push_str(&format!("\\u{:04x}", u32::from(control)));
            }
            other => out.push(other),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use fparkan_path::join_under;
    use std::time::{SystemTime, UNIX_EPOCH};

    #[test]
    #[ignore = "requires licensed corpus"]
    fn report_for_testdata_roots() {
        let root = licensed_root("IS");
        let manifest = discover(&root, DiscoverOptions::default()).expect("manifest");
        let report = report(&root, &manifest).expect("report");
        assert!(report.files > 0);
        assert!(report.metrics["nres_files"] > 0);
    }

    #[test]
    #[ignore = "requires licensed corpus"]
    fn licensed_part1_manifest_profile_and_counts_match_baseline() {
        let root = testdata_root("IS");
        let manifest = discover(&root, DiscoverOptions::default()).expect("part 1 manifest");
        let report = report(&root, &manifest).expect("report");
        assert_eq!(manifest.kind, CorpusKind::Part1);
        assert_eq!(report.files, 1_017);
        assert_eq!(report.metrics["nres_files"], 120);
        assert_eq!(report.metrics["rsli_files"], 2);
        assert_eq!(report.metrics["tma_files"], 29);
        assert_eq!(report.metrics["land_msh_files"], 33);
        assert_eq!(report.metrics["land_map_files"], 33);
        assert_eq!(report.metrics["unit_dat_files"], 425);
    }

    #[test]
    #[ignore = "requires licensed corpus"]
    fn licensed_part2_manifest_profile_and_counts_match_baseline() {
        let root = testdata_root("IS2");
        let manifest = discover(&root, DiscoverOptions::default()).expect("part 2 manifest");
        let report = report(&root, &manifest).expect("report");
        assert_eq!(manifest.kind, CorpusKind::Part2);
        assert_eq!(report.files, 1_302);
        assert_eq!(report.metrics["nres_files"], 134);
        assert_eq!(report.metrics["rsli_files"], 2);
        assert_eq!(report.metrics["tma_files"], 31);
        assert_eq!(report.metrics["land_msh_files"], 32);
        assert_eq!(report.metrics["land_map_files"], 32);
        assert_eq!(report.metrics["unit_dat_files"], 676);
    }

    #[test]
    #[ignore = "requires licensed corpus"]
    fn licensed_part1_has_no_casefold_relative_path_collisions() {
        let root = testdata_root("IS");
        let manifest = discover(&root, DiscoverOptions::default()).expect("part 1 manifest");
        assert!(manifest.casefold_collisions.is_empty());
    }

    #[test]
    #[ignore = "requires licensed corpus"]
    fn licensed_part2_has_no_casefold_relative_path_collisions() {
        let root = testdata_root("IS2");
        let manifest = discover(&root, DiscoverOptions::default()).expect("part 2 manifest");
        assert!(manifest.casefold_collisions.is_empty());
    }

    #[test]
    #[ignore = "requires licensed corpus"]
    fn licensed_part1_paths_stay_under_root() {
        assert_discovered_paths_stay_under_root("IS");
    }

    #[test]
    #[ignore = "requires licensed corpus"]
    fn licensed_part2_paths_stay_under_root() {
        assert_discovered_paths_stay_under_root("IS2");
    }

    #[test]
    fn report_json_contains_metrics_and_hashes_not_paths_or_payloads() {
        let manifest = CorpusManifest {
            kind: CorpusKind::Part1,
            files: vec![ManifestEntry {
                path: "secret/payload.bin".to_string(),
                size: 4,
                hash: sha256(b"DATA"),
            }],
            casefold_collisions: Vec::new(),
        };
        let report = report(Path::new("."), &manifest).expect("report");
        let json = render_report_json(&report);
        assert!(json.contains("\"schema_version\":\"fparkan-corpus-report-v1\""));
        assert!(json.contains("\"fingerprint\":"));
        assert!(json.contains("\"failures\":1"));
        assert!(json.contains("\"record_count\":1"));
        assert!(json.contains("\"metrics\":"));
        assert!(!json.contains("secret/payload.bin"));
        assert!(!json.contains("DATA"));
    }

    #[test]
    fn report_json_is_brace_balanced_and_escapes_metric_keys() {
        let report = CorpusReport {
            schema: 1,
            kind: CorpusKind::Unknown,
            files: 0,
            bytes: 0,
            metrics: BTreeMap::from([
                ("nres_files".to_string(), 3),
                ("odd\"key".to_string(), 1),
            ]),
            casefold_collisions: 0,
            fingerprint: sha256(b"balanced"),
            records: Vec::new(),
            failures: 0,
        };
        let json = render_report_json(&report);
        assert!(json.starts_with('{'));
        assert!(json.ends_with("\"odd\\\"key\":1}}"));
        assert!(!json.ends_with("}}}"));
        assert_eq!(json.matches('{').count(), json.matches('}').count());
        assert!(json.contains("\"metrics\":{\"nres_files\":3,"));
    }

    #[test]
    fn report_records_missing_manifest_files_as_failures() {
        let root = temp_dir("report-missing");
        let manifest = CorpusManifest {
            kind: CorpusKind::Unknown,
            files: vec![ManifestEntry {
                path: "missing.lib".to_string(),
                size: 1,
                hash: sha256(b"missing"),
            }],
            casefold_collisions: Vec::new(),
        };
        let report = report(&root, &manifest).expect("report");
        assert_eq!(report.failures, 1);
        assert_eq!(report.records.len(), 1);
        assert_eq!(report.records[0].path, "missing.lib");
        assert_eq!(report.records[0].status, CorpusFileStatus::Error);
        let _ = fs::remove_dir_all(root);
    }

    #[test]
    fn report_records_malformed_nres_as_failure() {
        let root = temp_dir("report-bad-nres");
        fs::write(root.join("bad.lib"), b"NRes").expect("bad nres");
        let manifest = CorpusManifest {
            kind: CorpusKind::Unknown,
            files: vec![ManifestEntry {
                path: "bad.lib".to_string(),
                size: 4,
                hash: sha256(b"NRes"),
            }],
            casefold_collisions: Vec::new(),
        };
        let report = report(&root, &manifest).expect("report");
        assert_eq!(report.failures, 1);
        assert_eq!(report.records[0].status, CorpusFileStatus::Error);
        assert_eq!(report.records[0].variant, "nres");
        assert!(report.records[0]
            .message
            .as_deref()
            .is_some_and(|message| message.contains("NRes")));
        let _ = fs::remove_dir_all(root);
    }

    #[test]
    fn report_uses_production_nres_parser_for_entry_metrics() {
        let root = temp_dir("report-nres");
        let archive = build_nres(&[
            TestNresEntry {
                name: "mesh.msh",
                type_id: 0,
                payload: b"mesh",
            },
            TestNresEntry {
                name: "mat.bin",
                type_id: 0x3054_414D,
                payload: b"mat0",
            },
            TestNresEntry {
                name: "texture.bin",
                type_id: 0x6D78_6554,
                payload: b"texm",
            },
        ]);
        fs::write(root.join("archive.lib"), &archive).expect("archive");
        let manifest = CorpusManifest {
            kind: CorpusKind::Unknown,
            files: vec![ManifestEntry {
                path: "archive.lib".to_string(),
                size: u64::try_from(archive.len()).expect("archive size"),
                hash: sha256(&archive),
            }],
            casefold_collisions: Vec::new(),
        };
        let report = report(&root, &manifest).expect("report");
        assert_eq!(report.failures, 0);
        assert_eq!(report.records.len(), 1);
        assert_eq!(report.records[0].status, CorpusFileStatus::Ok);
        assert_eq!(report.records[0].variant, "nres");
        assert_eq!(report.metrics["nres_files"], 1);
        assert_eq!(report.metrics["nres_entries"], 3);
        assert_eq!(report.metrics["msh_entries"], 1);
        assert_eq!(report.metrics["mat0_entries"], 1);
        assert_eq!(report.metrics["texm_entries"], 1);
        let _ = fs::remove_dir_all(root);
    }

    #[test]
    fn deterministic_traversal_is_creation_order_independent() {
        let first = temp_dir("order-first");
        let second = temp_dir("order-second");
        fs::create_dir_all(first.join("nested")).expect("first nested");
        fs::create_dir_all(second.join("nested")).expect("second nested");
        fs::write(first.join("b.bin"), b"b").expect("first b");
        fs::write(first.join("nested").join("a.bin"), b"a").expect("first a");
        fs::write(second.join("nested").join("a.bin"), b"a").expect("second a");
        fs::write(second.join("b.bin"), b"b").expect("second b");
        let first_manifest = discover(&first, DiscoverOptions::default()).expect("first manifest");
        let second_manifest =
            discover(&second, DiscoverOptions::default()).expect("second manifest");
        assert_eq!(first_manifest.files, second_manifest.files);
        let _ = fs::remove_dir_all(first);
        let _ = fs::remove_dir_all(second);
    }

    #[cfg(unix)]
    #[test]
    fn unreadable_directory_produces_error() {
        use std::os::unix::fs::PermissionsExt;
        let root = temp_dir("unreadable");
        let child = root.join("locked");
        fs::create_dir_all(&child).expect("locked dir");
        fs::set_permissions(&child, fs::Permissions::from_mode(0o000)).expect("lock dir");
        let result = discover(&root, DiscoverOptions::default());
        // Restore permissions before asserting so cleanup works even on failure.
        fs::set_permissions(&child, fs::Permissions::from_mode(0o700)).expect("unlock dir");
        let _ = fs::remove_dir_all(root);
        assert!(matches!(result, Err(CorpusError::Io { path, .. }) if path.ends_with("locked")));
    }

    #[cfg(unix)]
    #[test]
    fn symlink_loop_is_not_traversed_by_default() {
        use std::os::unix::fs::symlink;
        let root = temp_dir("symlink-loop");
        fs::write(root.join("real.bin"), b"real").expect("real file");
        symlink(&root, root.join("loop")).expect("loop symlink");
        let manifest = discover(&root, DiscoverOptions::default()).expect("manifest");
        assert_eq!(manifest.files.len(), 1);
        assert_eq!(manifest.files[0].path, "real.bin");
        let _ = fs::remove_dir_all(root);
    }

    #[test]
    fn casefold_collisions_are_registered() {
        let manifest = CorpusManifest {
            kind: CorpusKind::Unknown,
            files: vec![
                ManifestEntry {
                    path: "Textures/Foo.TEX".to_string(),
                    size: 1,
                    hash: sha256(b"first"),
                },
                ManifestEntry {
                    path: "textures/foo.tex".to_string(),
                    size: 1,
                    hash: sha256(b"second"),
                },
            ],
            casefold_collisions: Vec::new(),
        };
        let collisions = detect_casefold_collisions(&manifest.files);
        assert_eq!(
            collisions,
            vec![vec![
                "Textures/Foo.TEX".to_string(),
                "textures/foo.tex".to_string()
            ]]
        );
    }

    #[test]
    fn fingerprint_changes() {
        let mut manifest = CorpusManifest {
            kind: CorpusKind::Unknown,
            files: vec![ManifestEntry {
                path: "a".to_string(),
                size: 1,
                hash: sha256(b"before"),
            }],
            casefold_collisions: Vec::new(),
        };
        let a = fingerprint(&manifest);
        manifest.files[0].hash = sha256(b"after");
        assert_ne!(a, fingerprint(&manifest));
    }

    #[test]
    fn atomic_report_write() {
        let tmp = std::env::temp_dir().join(format!(
            "fparkan-report-{}.json",
            SystemTime::now()
                .duration_since(UNIX_EPOCH)
                .expect("clock")
                .as_nanos()
        ));
        let report = CorpusReport {
            schema: 1,
            kind: CorpusKind::Unknown,
            files: 0,
            bytes: 0,
            metrics: BTreeMap::new(),
            casefold_collisions: 0,
            fingerprint: sha256(b"empty-report"),
            records: Vec::new(),
            failures: 0,
        };
        write_report_atomic(&tmp, &report).expect("write");
        assert!(tmp.is_file());
        let _ = fs::remove_file(tmp);
    }

    /// One entry of a synthetic NRes archive built by [`build_nres`].
    struct TestNresEntry<'a> {
        name: &'a str,
        type_id: u32,
        payload: &'a [u8],
    }

    /// Builds a minimal NRes archive: a 16-byte header, 8-byte-aligned
    /// payloads, then one 64-byte directory record per entry.
    fn build_nres(entries: &[TestNresEntry<'_>]) -> Vec<u8> {
        let mut out = vec![0; 16];
        let mut offsets = Vec::with_capacity(entries.len());
        for entry in entries {
            offsets.push(u32::try_from(out.len()).expect("offset"));
            out.extend_from_slice(entry.payload);
            let padding = (8 - (out.len() % 8)) % 8;
            out.resize(out.len() + padding, 0);
        }
        let mut order: Vec<usize> = (0..entries.len()).collect();
        order.sort_by(|left, right| {
            entries[*left]
                .name
                .as_bytes()
                .cmp(entries[*right].name.as_bytes())
        });
        for (index, entry) in entries.iter().enumerate() {
            push_u32(&mut out, entry.type_id);
            push_u32(&mut out, 0);
            push_u32(&mut out, 0);
            push_u32(
                &mut out,
                u32::try_from(entry.payload.len()).expect("payload size"),
            );
            push_u32(&mut out, 0);
            let mut name = [0; 36];
            let name_bytes = entry.name.as_bytes();
            name[..name_bytes.len()].copy_from_slice(name_bytes);
            out.extend_from_slice(&name);
            push_u32(&mut out, offsets[index]);
            push_u32(&mut out, u32::try_from(order[index]).expect("sort index"));
        }
        // The header is filled in last because it records the total size.
        out[0..4].copy_from_slice(b"NRes");
        out[4..8].copy_from_slice(&0x100_u32.to_le_bytes());
        out[8..12].copy_from_slice(&u32::try_from(entries.len()).expect("count").to_le_bytes());
        let total_size = u32::try_from(out.len()).expect("total size");
        out[12..16].copy_from_slice(&total_size.to_le_bytes());
        out
    }

    /// Appends `value` as four little-endian bytes.
    fn push_u32(out: &mut Vec<u8>, value: u32) {
        out.extend_from_slice(&value.to_le_bytes());
    }

    /// Creates a fresh, timestamp-suffixed directory under the system temp dir.
    fn temp_dir(name: &str) -> PathBuf {
        let path = std::env::temp_dir().join(format!(
            "fparkan-corpus-{name}-{}",
            SystemTime::now()
                .duration_since(UNIX_EPOCH)
                .expect("clock")
                .as_nanos()
        ));
        fs::create_dir_all(&path).expect("temp dir");
        path
    }

    fn testdata_root(part: &str) -> PathBuf {
        licensed_root(part)
    }

    /// Resolves the licensed corpus root for `part` from its environment
    /// variable, panicking when it is unset or not a directory.
    fn licensed_root(part: &str) -> PathBuf {
        let variable = match part {
            "IS" => "FPARKAN_CORPUS_PART1_ROOT",
            "IS2" => "FPARKAN_CORPUS_PART2_ROOT",
            _ => panic!("unknown licensed corpus part: {part}"),
        };
        let root = std::env::var_os(variable)
            .map(PathBuf::from)
            .unwrap_or_else(|| panic!("{variable} is required for licensed corpus tests"));
        assert!(
            root.is_dir(),
            "licensed corpus root is missing: {}",
            root.display()
        );
        root
    }

    fn assert_discovered_paths_stay_under_root(part: &str) {
        let root = testdata_root(part);
        let manifest = discover(&root, DiscoverOptions::default()).expect("licensed manifest");
        for entry in &manifest.files {
            let normalized = normalize_relative(entry.path.as_bytes(), PathPolicy::HostCompatible)
                .expect("discovered path should re-normalize");
            let joined = join_under(&root, &normalized).expect("discovered path should join");
            assert!(
                joined.starts_with(&root),
                "discovered path escaped root: {}",
                entry.path
            );
        }
    }
}