From 11c9a1d89be8d2c89f276cad7de49bc5d82fac60 Mon Sep 17 00:00:00 2001
From: Josh Snyder
Date: Wed, 19 Jul 2023 02:01:38 -0700
Subject: [PATCH] Rust dump implementation

---
 Cargo.lock  |  57 +++++++++++++++
 Cargo.toml  |  11 +++
 src/main.rs | 207 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 275 insertions(+)
 create mode 100644 Cargo.lock
 create mode 100644 Cargo.toml
 create mode 100644 src/main.rs

diff --git a/Cargo.lock b/Cargo.lock
new file mode 100644
index 0000000..8a81503
--- /dev/null
+++ b/Cargo.lock
@@ -0,0 +1,57 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "adler"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe"
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "crc32fast"
+version = "1.3.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d"
+dependencies = [
+ "cfg-if",
+]
+
+[[package]]
+name = "flate2"
+version = "1.0.26"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3b9429470923de8e8cbd4d2dc513535400b4b3fef0319fb5c4e1f520a7bef743"
+dependencies = [
+ "crc32fast",
+ "miniz_oxide",
+]
+
+[[package]]
+name = "happycache"
+version = "0.1.0"
+dependencies = [
+ "flate2",
+ "libc",
+]
+
+[[package]]
+name = "libc"
+version = "0.2.147"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3"
+
+[[package]]
+name = "miniz_oxide"
+version = "0.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e7810e0be55b428ada41041c41f32c9f1a42817901b4ccf45fa3d4b6561e74c7"
+dependencies = [
+ "adler",
+]
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..4040a12
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "happycache"
+version = "0.1.0"
+authors = ["Josh Snyder "]
+edition = "2018"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+flate2 = "1.0.26"
+libc = "0.2.147"
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..0052312
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,207 @@
+use std::fs::File;
+use std::io::Write;
+use flate2::write::GzEncoder;
+use flate2::Compression;
+use std::convert::TryInto;
+use std::io::Error;
+use std::iter::Iterator;
+use std::path::Path;
+use std::fs::Metadata;
+use std::os::fd::AsRawFd;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+// The below is copied from:
+// https://github.com/RazrFalcon/memmap2-rs/blob/master/src/unix.rs
+// Apache2 or MIT licensed
+//
+// Our primary goal is to avoid an extra `statx` syscall on each mmapped file.
+
+/// Returns the system page size in bytes.
+///
+/// `sysconf` is queried on first use and the answer is cached in an atomic.
+fn page_size() -> usize {
+    static PAGE_SIZE: AtomicUsize = AtomicUsize::new(0);
+
+    match PAGE_SIZE.load(Ordering::Relaxed) {
+        0 => {
+            // SAFETY: `sysconf` only reads its integer argument.
+            let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) as usize };
+
+            PAGE_SIZE.store(page_size, Ordering::Relaxed);
+
+            page_size
+        }
+        page_size => page_size,
+    }
+}
+
+/// An owned memory mapping: the range is unmapped when the value is dropped.
+pub struct Mmap {
+    ptr: *mut libc::c_void,
+    len: usize,
+}
+
+impl Drop for Mmap {
+    fn drop(&mut self) {
+        // Any errors during unmapping/closing are ignored as the only way
+        // to report them would be through panicking which is highly discouraged
+        // in Drop impls, c.f. https://github.com/rust-lang/lang-team/issues/97
+        unsafe { libc::munmap(self.ptr, self.len as libc::size_t) };
+    }
+}
+
+// End copy
+
+/// Maps the first `length` bytes of `file` without any access permissions.
+///
+/// The mapping is only passed to `mincore`, which reports residency without
+/// reading the pages, so `PROT_NONE` is enough.
+///
+/// # Errors
+///
+/// Returns the OS error when `mmap(2)` fails.
+fn mmap(file: &File, length: usize) -> Result<Mmap, Error> {
+    // SAFETY: a null hint lets the kernel pick the address, and `file` keeps
+    // the descriptor open for the duration of the call.
+    let ptr = unsafe {
+        libc::mmap(
+            std::ptr::null_mut(),
+            length,
+            libc::PROT_NONE,
+            libc::MAP_PRIVATE,
+            file.as_raw_fd(),
+            0,
+        )
+    };
+
+    if ptr == libc::MAP_FAILED {
+        Err(Error::last_os_error())
+    } else {
+        Ok(Mmap { ptr, len: length })
+    }
+}
+
+/// Asks the kernel which pages of `slice` are resident in memory.
+///
+/// Returns one byte per page; the least significant bit is set for resident
+/// pages and the remaining bits are unspecified.
+///
+/// # Errors
+///
+/// Returns the OS error when `mincore(2)` fails.
+fn mincore(slice: &Mmap) -> Result<Vec<u8>, Error> {
+    let page_size = page_size();
+    let pages = (slice.len + page_size - 1) / page_size;
+    let mut buffer = vec![0_u8; pages];
+
+    // SAFETY: `buffer` has one byte for every page overlapping the range,
+    // which is exactly what `mincore(2)` writes.
+    let ret = unsafe { libc::mincore(slice.ptr, slice.len, buffer.as_mut_ptr()) };
+    if ret == 0 {
+        Ok(buffer)
+    } else {
+        Err(Error::last_os_error())
+    }
+}
+
+/// Writes the resident pages of the file at `path` to `encoder`.
+///
+/// The record is the path on one line followed by one line per resident page,
+/// each holding the page index as a delta from the previous resident page.
+/// Nothing is written for a file with no resident pages.
+///
+/// # Errors
+///
+/// Returns any error from opening, mapping or querying the file, or from
+/// writing to `encoder`.
+fn dump_file(encoder: &mut impl Write, path: &Path, metadata: &Metadata) -> Result<(), Error> {
+    // Pages queried per `mincore` call; caps the residency buffer at 1 MiB.
+    const CHUNK_PAGES: usize = 1 << 20;
+
+    let page_size = page_size();
+    let input_file = File::open(path)?;
+
+    let file_len: usize = metadata.len().try_into().map_err(|_| {
+        Error::new(std::io::ErrorKind::InvalidInput, "file is too large to map")
+    })?;
+    let mapping = mmap(&input_file, file_len)?;
+    let chunk_bytes = CHUNK_PAGES.saturating_mul(page_size);
+
+    let mut wrote_header = false;
+    let mut last = 0;
+    let mut start = 0;
+    while start < mapping.len {
+        let length = std::cmp::min(chunk_bytes, mapping.len - start);
+        // The chunk is only a view into `mapping`. `ManuallyDrop` stops its
+        // destructor from unmapping memory that `mapping` still owns and will
+        // unmap itself.
+        let slice = std::mem::ManuallyDrop::new(Mmap {
+            // SAFETY: `start < mapping.len`, so the pointer stays in bounds.
+            ptr: unsafe { mapping.ptr.add(start) },
+            len: length,
+        });
+        let first_page = start / page_size;
+
+        for (chunk_pos, e) in mincore(&slice)?.into_iter().enumerate() {
+            // Only the low bit reports residency.
+            if e & 1 == 0 {
+                continue;
+            }
+
+            if !wrote_header {
+                encoder.write_all(path.to_string_lossy().as_bytes())?;
+                encoder.write_all(b"\n")?;
+                wrote_header = true;
+            }
+
+            // Write the difference to the file, rather than the whole number
+            // This improves gzip's compression ratio
+            let pos = first_page + chunk_pos;
+            writeln!(encoder, "{}", pos - last)?;
+            last = pos;
+        }
+        start += length;
+    }
+
+    Ok(())
+}
+
+/// Recursively walks `directory` and dumps every non-empty regular file.
+///
+/// # Errors
+///
+/// Stops at the first I/O error and returns it.
+fn spider(output: &mut impl Write, directory: &Path) -> std::io::Result<()> {
+    for entry in directory.read_dir()? {
+        let entry = entry?;
+        let path = entry.path();
+        let metadata = path.metadata()?;
+
+        if metadata.is_dir() {
+            // `metadata` follows symlinks, so a link back to an ancestor would
+            // otherwise be walked over and over until the OS rejects the path.
+            if !entry.file_type()?.is_symlink() {
+                spider(output, &path)?;
+            }
+        } else if metadata.is_file() && metadata.len() > 0 {
+            dump_file(output, &path, &metadata)?;
+        }
+    }
+
+    Ok(())
+}
+
+/// Dumps the page cache residency of every file below the current directory
+/// into `.happycache.gz`.
+fn main() -> std::io::Result<()> {
+    let mut encoder = {
+        let output_file = File::create(".happycache.gz")?;
+        GzEncoder::new(output_file, Compression::default())
+    };
+
+    spider(&mut encoder, Path::new("."))?;
+    // `finish` writes the gzip trailer and reports any pending write error.
+    encoder.finish()?;
+
+    Ok(())
+}