diff options
Diffstat (limited to 'crates/fparkan-corpus/src')
| -rw-r--r-- | crates/fparkan-corpus/src/lib.rs | 695 |
1 files changed, 695 insertions, 0 deletions
diff --git a/crates/fparkan-corpus/src/lib.rs b/crates/fparkan-corpus/src/lib.rs new file mode 100644 index 0000000..ba26c73 --- /dev/null +++ b/crates/fparkan-corpus/src/lib.rs @@ -0,0 +1,695 @@ +#![forbid(unsafe_code)] +//! Licensed corpus discovery and aggregate reports. + +use fparkan_path::{ascii_lookup_key, normalize_relative, PathPolicy}; +use std::collections::{BTreeMap, BTreeSet}; +use std::fmt; +use std::fs; +use std::io::Write; +use std::path::{Path, PathBuf}; + +/// Corpus kind. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum CorpusKind { + /// Demo corpus. + Demo, + /// Part 1 full game. + Part1, + /// Part 2 full game. + Part2, + /// Unknown local directory. + Unknown, +} + +/// Corpus root. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct CorpusRoot(pub PathBuf); + +/// Discovery options. +#[derive(Clone, Copy, Debug, Default)] +pub struct DiscoverOptions { + /// Whether symlinks may be traversed. + pub follow_symlinks: bool, +} + +/// File manifest entry. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct ManifestEntry { + /// Normalized relative path. + pub path: String, + /// File size in bytes. + pub size: u64, + /// Stable content fingerprint. + pub hash: u64, +} + +/// Corpus manifest. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct CorpusManifest { + /// Kind. + pub kind: CorpusKind, + /// Sorted files. + pub files: Vec<ManifestEntry>, + /// Casefold collisions. + pub casefold_collisions: Vec<Vec<String>>, +} + +/// Aggregate report. +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct CorpusReport { + /// Schema version. + pub schema: u32, + /// Kind. + pub kind: CorpusKind, + /// Total files. + pub files: usize, + /// Total bytes. + pub bytes: u64, + /// Metrics. + pub metrics: BTreeMap<String, u64>, + /// Casefold collision count. + pub casefold_collisions: usize, + /// Manifest fingerprint. + pub fingerprint: u64, +} + +/// Corpus error. +#[derive(Debug)] +pub enum CorpusError { + /// I/O failure. + Io { + /// Path where I/O failed. + path: PathBuf, + /// Source error. + source: std::io::Error, + }, + /// Invalid root. + InvalidRoot(PathBuf), + /// Invalid path. + InvalidPath(String), +} + +impl fmt::Display for CorpusError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Io { path, source } => write!(f, "{}: {source}", path.display()), + Self::InvalidRoot(path) => write!(f, "invalid corpus root: {}", path.display()), + Self::InvalidPath(path) => write!(f, "invalid corpus path: {path}"), + } + } +} + +impl std::error::Error for CorpusError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + Self::Io { source, .. } => Some(source), + Self::InvalidRoot(_) | Self::InvalidPath(_) => None, + } + } +} + +/// Discovers a corpus under a root directory. +/// +/// # Errors +/// +/// Returns [`CorpusError`] if the root is invalid, traversal encounters an I/O +/// error, or a discovered path cannot be represented by the legacy path policy. +pub fn discover(root: &Path, options: DiscoverOptions) -> Result<CorpusManifest, CorpusError> { + if !root.is_dir() { + return Err(CorpusError::InvalidRoot(root.to_path_buf())); + } + let mut files = Vec::new(); + walk(root, root, options, &mut files)?; + files.sort_by(|a, b| a.path.cmp(&b.path)); + + let kind = classify(root, &files); + let casefold_collisions = detect_casefold_collisions(&files); + Ok(CorpusManifest { + kind, + files, + casefold_collisions, + }) +} + +fn walk( + root: &Path, + dir: &Path, + options: DiscoverOptions, + out: &mut Vec<ManifestEntry>, +) -> Result<(), CorpusError> { + let read_dir = fs::read_dir(dir).map_err(|source| CorpusError::Io { + path: dir.to_path_buf(), + source, + })?; + let mut entries = Vec::new(); + for entry in read_dir { + let entry = entry.map_err(|source| CorpusError::Io { + path: dir.to_path_buf(), + source, + })?; + entries.push(entry.path()); + } + entries.sort(); + for path in entries { + if path + .file_name() + .and_then(|name| name.to_str()) + .is_some_and(|name| name.starts_with('.')) + { + continue; + } + let metadata = fs::symlink_metadata(&path).map_err(|source| CorpusError::Io { + path: path.clone(), + source, + })?; + if metadata.file_type().is_symlink() && !options.follow_symlinks { + continue; + } + if metadata.is_dir() { + walk(root, &path, options, out)?; + continue; + } + if !metadata.is_file() { + continue; + } + let rel = path + .strip_prefix(root) + .map_err(|_| CorpusError::InvalidPath(path.display().to_string()))?; + let rel_text = rel + .to_str() + .ok_or_else(|| CorpusError::InvalidPath(path.display().to_string()))?; + let normalized = normalize_relative(rel_text.as_bytes(), PathPolicy::HostCompatible) + .map_err(|_| CorpusError::InvalidPath(rel_text.to_string()))?; + let bytes = fs::read(&path).map_err(|source| CorpusError::Io { + path: path.clone(), + source, + })?; + out.push(ManifestEntry { + path: normalized.as_str().to_string(), + size: metadata.len(), + hash: stable_hash(&bytes), + }); + } + Ok(()) +} + +fn classify(root: &Path, files: &[ManifestEntry]) -> CorpusKind { + let name = root + .file_name() + .and_then(|v| v.to_str()) + .unwrap_or_default() + .to_ascii_uppercase(); + if name == "IS" { + CorpusKind::Part1 + } else if name == "IS2" { + CorpusKind::Part2 + } else if files + .iter() + .any(|f| f.path.eq_ignore_ascii_case("iron_3d.exe")) + { + CorpusKind::Part1 + } else { + CorpusKind::Unknown + } +} + +fn detect_casefold_collisions(files: &[ManifestEntry]) -> Vec<Vec<String>> { + let mut grouped: BTreeMap<Vec<u8>, BTreeSet<String>> = BTreeMap::new(); + for file in files { + grouped + .entry(ascii_lookup_key(file.path.as_bytes()).0) + .or_default() + .insert(file.path.clone()); + } + grouped + .into_values() + .filter(|paths| paths.len() > 1) + .map(|paths| paths.into_iter().collect()) + .collect() +} + +/// Builds aggregate report. +#[must_use] +pub fn report(root: &Path, manifest: &CorpusManifest) -> CorpusReport { + let mut metrics = BTreeMap::new(); + metrics.insert("nres_files".to_string(), 0); + metrics.insert("nres_entries".to_string(), 0); + metrics.insert("rsli_files".to_string(), 0); + metrics.insert("tma_files".to_string(), 0); + metrics.insert("land_msh_files".to_string(), 0); + metrics.insert("land_map_files".to_string(), 0); + metrics.insert("unit_dat_files".to_string(), 0); + metrics.insert("msh_entries".to_string(), 0); + metrics.insert("mat0_entries".to_string(), 0); + metrics.insert("texm_entries".to_string(), 0); + metrics.insert("fxid_entries".to_string(), 0); + metrics.insert("wear_entries".to_string(), 0); + + for entry in &manifest.files { + let lower = entry.path.to_ascii_lowercase(); + if lower.ends_with("data.tma") { + bump(&mut metrics, "tma_files", 1); + } + if lower.ends_with("land.msh") { + bump(&mut metrics, "land_msh_files", 1); + } + if lower.ends_with("land.map") { + bump(&mut metrics, "land_map_files", 1); + } + if has_extension(&lower, "dat") + && (lower.starts_with("units/") || lower.contains("/units/")) + { + bump(&mut metrics, "unit_dat_files", 1); + } + + let path = root.join(&entry.path); + if let Ok(bytes) = fs::read(path) { + if bytes.starts_with(b"NRes") { + bump(&mut metrics, "nres_files", 1); + if let Some(entries) = inspect_nres_entries(&bytes) { + bump(&mut metrics, "nres_entries", entries.len() as u64); + for entry in entries { + let name = entry.name.to_ascii_lowercase(); + if has_extension(&name, "msh") { + bump(&mut metrics, "msh_entries", 1); + } + match entry.kind { + 0x3054_414D => { + bump(&mut metrics, "mat0_entries", 1); + } + 0x6D78_6554 => { + bump(&mut metrics, "texm_entries", 1); + } + 0x4449_5846 => { + bump(&mut metrics, "fxid_entries", 1); + } + 0x5241_4557 => { + bump(&mut metrics, "wear_entries", 1); + } + _ => {} + } + } + } + } else if bytes.starts_with(b"NL") { + bump(&mut metrics, "rsli_files", 1); + } + } + } + + CorpusReport { + schema: 1, + kind: manifest.kind, + files: manifest.files.len(), + bytes: manifest.files.iter().map(|f| f.size).sum(), + metrics, + casefold_collisions: manifest.casefold_collisions.len(), + fingerprint: fingerprint(manifest), + } +} + +fn bump(metrics: &mut BTreeMap<String, u64>, key: &str, delta: u64) { + if let Some(value) = metrics.get_mut(key) { + *value = value.saturating_add(delta); + } +} + +fn has_extension(path: &str, expected: &str) -> bool { + Path::new(path) + .extension() + .is_some_and(|extension| extension.eq_ignore_ascii_case(expected)) +} + +#[derive(Clone, Debug)] +struct NresEntryBrief { + kind: u32, + name: String, +} + +fn inspect_nres_entries(bytes: &[u8]) -> Option<Vec<NresEntryBrief>> { + if bytes.len() < 16 || !bytes.starts_with(b"NRes") { + return None; + } + let count = i32::from_le_bytes(bytes.get(8..12)?.try_into().ok()?); + if count < 0 { + return None; + } + let count = usize::try_from(count).ok()?; + let directory_len = count.checked_mul(64)?; + let directory_offset = bytes.len().checked_sub(directory_len)?; + let mut names = Vec::with_capacity(count); + for index in 0..count { + let base = directory_offset.checked_add(index.checked_mul(64)?)?; + let kind = u32::from_le_bytes(bytes.get(base..base + 4)?.try_into().ok()?); + let raw = bytes.get(base + 20..base + 56)?; + let len = raw.iter().position(|b| *b == 0).unwrap_or(raw.len()); + names.push(NresEntryBrief { + kind, + name: String::from_utf8_lossy(&raw[..len]).to_string(), + }); + } + Some(names) +} + +/// Computes stable manifest fingerprint. +#[must_use] +pub fn fingerprint(manifest: &CorpusManifest) -> u64 { + let mut state = 0xcbf2_9ce4_8422_2325; + for file in &manifest.files { + hash_into(&mut state, file.path.as_bytes()); + hash_into(&mut state, &file.size.to_le_bytes()); + hash_into(&mut state, &file.hash.to_le_bytes()); + } + state +} + +fn stable_hash(bytes: &[u8]) -> u64 { + let mut state = 0xcbf2_9ce4_8422_2325; + hash_into(&mut state, bytes); + state +} + +fn hash_into(state: &mut u64, bytes: &[u8]) { + for byte in bytes { + *state ^= u64::from(*byte); + *state = state.wrapping_mul(0x0000_0100_0000_01b3); + } +} + +/// Writes report atomically. +/// +/// # Errors +/// +/// Returns [`CorpusError`] if the parent directory, temporary file, write, or +/// final rename operation fails. +pub fn write_report_atomic(path: &Path, report: &CorpusReport) -> Result<(), CorpusError> { + let tmp = path.with_extension("tmp"); + if let Some(parent) = path.parent() { + fs::create_dir_all(parent).map_err(|source| CorpusError::Io { + path: parent.to_path_buf(), + source, + })?; + } + let mut file = fs::File::create(&tmp).map_err(|source| CorpusError::Io { + path: tmp.clone(), + source, + })?; + file.write_all(render_report_json(report).as_bytes()) + .map_err(|source| CorpusError::Io { + path: tmp.clone(), + source, + })?; + file.sync_all().map_err(|source| CorpusError::Io { + path: tmp.clone(), + source, + })?; + fs::rename(&tmp, path).map_err(|source| CorpusError::Io { + path: path.to_path_buf(), + source, + })?; + Ok(()) +} + +/// Renders report JSON. +#[must_use] +pub fn render_report_json(report: &CorpusReport) -> String { + let mut out = format!( + "{{\"schema_version\":\"fparkan-corpus-report-v1\",\"schema\":{},\"kind\":\"{:?}\",\"files\":{},\"bytes\":{},\"casefold_collisions\":{},\"fingerprint\":\"{:016x}\",\"metrics\":{{", + report.schema, + report.kind, + report.files, + report.bytes, + report.casefold_collisions, + report.fingerprint + ); + for (idx, (key, value)) in report.metrics.iter().enumerate() { + if idx > 0 { + out.push(','); + } + out.push('"'); + out.push_str(key); + out.push_str("\":"); + out.push_str(&value.to_string()); + } + out.push_str("}}"); + out.push('}'); + out +} + +#[cfg(test)] +mod tests { + use super::*; + use fparkan_path::join_under; + use std::time::{SystemTime, UNIX_EPOCH}; + + #[test] + fn report_for_testdata_roots() { + let root = Path::new(env!("CARGO_MANIFEST_DIR")) + .join("../..") + .join("testdata") + .join("IS"); + if !root.is_dir() { + return; + } + let manifest = discover(&root, DiscoverOptions::default()).expect("manifest"); + let report = report(&root, &manifest); + assert!(report.files > 0); + assert!(report.metrics["nres_files"] > 0); + } + + #[test] + fn licensed_part1_manifest_profile_and_counts_match_baseline() { + let root = testdata_root("IS"); + let manifest = discover(&root, DiscoverOptions::default()).expect("part 1 manifest"); + let report = report(&root, &manifest); + + assert_eq!(manifest.kind, CorpusKind::Part1); + assert_eq!(report.files, 1_017); + assert_eq!(report.metrics["nres_files"], 120); + assert_eq!(report.metrics["rsli_files"], 2); + assert_eq!(report.metrics["tma_files"], 29); + assert_eq!(report.metrics["land_msh_files"], 33); + assert_eq!(report.metrics["land_map_files"], 33); + assert_eq!(report.metrics["unit_dat_files"], 425); + } + + #[test] + fn licensed_part2_manifest_profile_and_counts_match_baseline() { + let root = testdata_root("IS2"); + let manifest = discover(&root, DiscoverOptions::default()).expect("part 2 manifest"); + let report = report(&root, &manifest); + + assert_eq!(manifest.kind, CorpusKind::Part2); + assert_eq!(report.files, 1_302); + assert_eq!(report.metrics["nres_files"], 134); + assert_eq!(report.metrics["rsli_files"], 2); + assert_eq!(report.metrics["tma_files"], 31); + assert_eq!(report.metrics["land_msh_files"], 32); + assert_eq!(report.metrics["land_map_files"], 32); + assert_eq!(report.metrics["unit_dat_files"], 676); + } + + #[test] + fn licensed_part1_has_no_casefold_relative_path_collisions() { + let root = testdata_root("IS"); + let manifest = discover(&root, DiscoverOptions::default()).expect("part 1 manifest"); + + assert!(manifest.casefold_collisions.is_empty()); + } + + #[test] + fn licensed_part2_has_no_casefold_relative_path_collisions() { + let root = testdata_root("IS2"); + let manifest = discover(&root, DiscoverOptions::default()).expect("part 2 manifest"); + + assert!(manifest.casefold_collisions.is_empty()); + } + + #[test] + fn licensed_part1_paths_stay_under_root() { + assert_discovered_paths_stay_under_root("IS"); + } + + #[test] + fn licensed_part2_paths_stay_under_root() { + assert_discovered_paths_stay_under_root("IS2"); + } + + #[test] + fn report_json_contains_metrics_and_hashes_not_paths_or_payloads() { + let manifest = CorpusManifest { + kind: CorpusKind::Part1, + files: vec![ManifestEntry { + path: "secret/payload.bin".to_string(), + size: 4, + hash: stable_hash(b"DATA"), + }], + casefold_collisions: Vec::new(), + }; + let report = report(Path::new("."), &manifest); + let json = render_report_json(&report); + + assert!(json.contains("\"schema_version\":\"fparkan-corpus-report-v1\"")); + assert!(json.contains("\"fingerprint\":")); + assert!(json.contains("\"metrics\":")); + assert!(!json.contains("secret/payload.bin")); + assert!(!json.contains("DATA")); + } + + #[test] + fn deterministic_traversal_is_creation_order_independent() { + let first = temp_dir("order-first"); + let second = temp_dir("order-second"); + fs::create_dir_all(first.join("nested")).expect("first nested"); + fs::create_dir_all(second.join("nested")).expect("second nested"); + + fs::write(first.join("b.bin"), b"b").expect("first b"); + fs::write(first.join("nested").join("a.bin"), b"a").expect("first a"); + fs::write(second.join("nested").join("a.bin"), b"a").expect("second a"); + fs::write(second.join("b.bin"), b"b").expect("second b"); + + let first_manifest = discover(&first, DiscoverOptions::default()).expect("first manifest"); + let second_manifest = + discover(&second, DiscoverOptions::default()).expect("second manifest"); + + assert_eq!(first_manifest.files, second_manifest.files); + let _ = fs::remove_dir_all(first); + let _ = fs::remove_dir_all(second); + } + + #[cfg(unix)] + #[test] + fn unreadable_directory_produces_error() { + use std::os::unix::fs::PermissionsExt; + + let root = temp_dir("unreadable"); + let child = root.join("locked"); + fs::create_dir_all(&child).expect("locked dir"); + fs::set_permissions(&child, fs::Permissions::from_mode(0o000)).expect("lock dir"); + + let result = discover(&root, DiscoverOptions::default()); + + fs::set_permissions(&child, fs::Permissions::from_mode(0o700)).expect("unlock dir"); + let _ = fs::remove_dir_all(root); + assert!(matches!(result, Err(CorpusError::Io { path, .. }) if path.ends_with("locked"))); + } + + #[cfg(unix)] + #[test] + fn symlink_loop_is_not_traversed_by_default() { + use std::os::unix::fs::symlink; + + let root = temp_dir("symlink-loop"); + fs::write(root.join("real.bin"), b"real").expect("real file"); + symlink(&root, root.join("loop")).expect("loop symlink"); + + let manifest = discover(&root, DiscoverOptions::default()).expect("manifest"); + + assert_eq!(manifest.files.len(), 1); + assert_eq!(manifest.files[0].path, "real.bin"); + let _ = fs::remove_dir_all(root); + } + + #[test] + fn casefold_collisions_are_registered() { + let manifest = CorpusManifest { + kind: CorpusKind::Unknown, + files: vec![ + ManifestEntry { + path: "Textures/Foo.TEX".to_string(), + size: 1, + hash: 1, + }, + ManifestEntry { + path: "textures/foo.tex".to_string(), + size: 1, + hash: 2, + }, + ], + casefold_collisions: Vec::new(), + }; + + let collisions = detect_casefold_collisions(&manifest.files); + + assert_eq!( + collisions, + vec![vec![ + "Textures/Foo.TEX".to_string(), + "textures/foo.tex".to_string() + ]] + ); + } + + #[test] + fn fingerprint_changes() { + let mut manifest = CorpusManifest { + kind: CorpusKind::Unknown, + files: vec![ManifestEntry { + path: "a".to_string(), + size: 1, + hash: 1, + }], + casefold_collisions: Vec::new(), + }; + let a = fingerprint(&manifest); + manifest.files[0].hash = 2; + assert_ne!(a, fingerprint(&manifest)); + } + + #[test] + fn atomic_report_write() { + let tmp = std::env::temp_dir().join(format!( + "fparkan-report-{}.json", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("clock") + .as_nanos() + )); + let report = CorpusReport { + schema: 1, + kind: CorpusKind::Unknown, + files: 0, + bytes: 0, + metrics: BTreeMap::new(), + casefold_collisions: 0, + fingerprint: 0, + }; + write_report_atomic(&tmp, &report).expect("write"); + assert!(tmp.is_file()); + let _ = fs::remove_file(tmp); + } + + fn temp_dir(name: &str) -> PathBuf { + let path = std::env::temp_dir().join(format!( + "fparkan-corpus-{name}-{}", + SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("clock") + .as_nanos() + )); + fs::create_dir_all(&path).expect("temp dir"); + path + } + + fn testdata_root(part: &str) -> PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .join("../..") + .join("testdata") + .join(part) + } + + fn assert_discovered_paths_stay_under_root(part: &str) { + let root = testdata_root(part); + let manifest = discover(&root, DiscoverOptions::default()).expect("licensed manifest"); + + for entry in &manifest.files { + let normalized = normalize_relative(entry.path.as_bytes(), PathPolicy::HostCompatible) + .expect("discovered path should re-normalize"); + let joined = join_under(&root, &normalized).expect("discovered path should join"); + assert!( + joined.starts_with(&root), + "discovered path escaped root: {}", + entry.path + ); + } + } +} |
