From fd292a53be38e25b286585a3955dea2f060fe699 Mon Sep 17 00:00:00 2001 From: nick Date: Mon, 11 Mar 2024 22:11:11 -0400 Subject: [PATCH] added parallelism --- src/directory.rs | 57 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 9 deletions(-) diff --git a/src/directory.rs b/src/directory.rs index 34707a3..6bfdfdc 100644 --- a/src/directory.rs +++ b/src/directory.rs @@ -5,6 +5,8 @@ use std::io::{Result, ErrorKind}; use crate::unit::Unit; +use rayon::prelude::*; + #[derive(Debug, Clone)] pub struct Directory { name: PathBuf, @@ -52,15 +54,51 @@ impl Directory { } }; - let mut size = 0; - let mut children = Vec::new(); - for entry in dir { - let child = Self::new(entry?.path())?; - size += child.size; - children.push(child); - } + // this is a complicated iterator pattern. I'll do my best to explain. + // 1. the end result is that we `reduce()` the iterator to a single + // (u64, Vec<Directory>) tuple to return. this is done by... + let (size, children) = + // 2. taking the iterator over the directory and parallelising it... + dir.par_bridge() + // 3. this is the recursive step: try to create new Directory + // objects from each item in the iterator + .map(|entry| Self::new(entry?.path())) + // 4. the fold (this is try_fold because we're iterating over Results). + // each fold adds a directory as a child and increases the total size + .try_fold( + || (0, Vec::new()), + |(mut size, mut children), dir| { + let dir = dir?; + size += dir.size; + children.push(dir); + // have to specify std::io::Result::Ok otherwise it complains + // that it can't infer the E in Result<T, E> + Result::Ok((size, children)) + } + ) + // 5. the final step is to reduce, which is as simple as concatenating + // every vector and summing up their sizes. + .try_reduce( + || (0, Vec::new()), + |(asize, mut avec), (bsize, bvec)| { + avec.extend(bvec); + Ok((asize + bsize, avec)) + } + )?; - Ok(Self{ + // final notes: + // 1. 
I am unsure if it is better to do a bunch of partial sums + // during the fold() and reduce() steps, or if it is best to + // have them only do data collection and sum the sizes + // later. intuitively we would want to do everything in + // parallel but I have no data to support this + // 2. this is a super complicated iterator pattern. If anyone + // knows how to simplify it I'm all ears, but this being + // parallel is the main advantage it has over du so I don't + // want to abandon that, even though a serial for loop is + // *far* clearer + + Ok(Self { name, size, children, @@ -74,7 +112,8 @@ impl Directory { let mut result = self.vectorise(unit).iter().map(|e| e.stringify_tabbed(tab_size) + "\n").collect::(); if ! result.is_empty() { - let final_newline_char_range = result.len()-2 .. result.len(); + // unless there was no output, remove the final "\n" + let final_newline_char_range = result.len()-"\n".len() .. result.len(); result.drain(final_newline_char_range); }