91 lines
2.6 KiB
Bash
91 lines
2.6 KiB
Bash
#!/bin/bash
|
|
|
|
#######################################
# Ensure the pv progress viewer is available, installing it via apt-get when
# it is missing.
# Globals:   EUID (read)
# Arguments: none
# Outputs:   a status line on stdout before installing; errors on stderr
# Returns:   0 if pv is already present or was installed, 1 otherwise
#######################################
install_dependencies() {
  # Nothing to do when pv is already on PATH.
  if command -v pv > /dev/null 2>&1; then
    return 0
  fi

  if ! command -v apt-get > /dev/null 2>&1; then
    echo "pv is not installed and apt-get is unavailable; install pv manually." >&2
    return 1
  fi

  # Only go through sudo when not already root, so the function also works in
  # environments (e.g. containers) where sudo itself is not installed.
  local -a as_root=()
  if ((EUID != 0)); then
    as_root=(sudo)
  fi

  echo "Installing required tools..."

  # apt-get is used instead of apt because apt's command-line interface is
  # aimed at interactive use. A failed index refresh is only a warning: the
  # install may still succeed from the existing package lists.
  if ! "${as_root[@]}" apt-get update; then
    echo "Warning: apt-get update failed; trying to install pv anyway." >&2
  fi

  if ! "${as_root[@]}" apt-get install -y pv; then
    echo "Failed to install pv." >&2
    return 1
  fi
}
|
|
|
|
# Copy stdin to stdout, drawing a pv progress bar when pv is installed.
# Arguments: $1 - expected number of bytes (gives pv a percentage and ETA)
_combine_progress() {
  if command -v pv > /dev/null 2>&1; then
    pv -s "$1"
  else
    cat
  fi
}

#######################################
# Concatenate text files, sort the lines, drop duplicate lines and write the
# result to the output file, then print line statistics.
# Arguments:
#   $1    - path of the output file to create
#   $2... - input files
# Outputs:
#   progress messages and statistics on stdout; errors on stderr
# Side effects:
#   installs an EXIT trap (replacing any existing one) that removes the
#   temporary working directory; the directory is also removed on success.
# Exits:
#   1 if an input is not a regular file or a processing step fails
#######################################
combine_and_deduplicate() {
  local output_file="$1"
  shift
  local files=("$@")
  local file size
  local total_size=0

  # Validate every input before doing any work, so a bad path late in the list
  # does not waste time processing the earlier files.
  for file in "${files[@]}"; do
    if [[ ! -f "$file" ]]; then
      echo "File not found: $file" >&2
      exit 1
    fi
    size=$(wc -c < "$file") || exit 1
    total_size=$((total_size + size))
  done

  local temp_dir
  if ! temp_dir=$(mktemp -d); then
    echo "Failed to create a temporary directory." >&2
    exit 1
  fi
  local temp_combined="$temp_dir/temp_combined.txt"
  local temp_output="$temp_dir/combined_temp_output.txt"

  # The path is expanded now (double quotes + %q) on purpose: temp_dir is a
  # local, so it is no longer in scope if the trap only fires at script exit
  # after this function has returned.
  # shellcheck disable=SC2064
  trap "rm -rf -- $(printf '%q' "$temp_dir")" EXIT

  echo "Combining files..."
  # Files are streamed whole and in argument order. Cutting them into
  # fixed-size byte chunks would break lines at the chunk boundaries.
  for file in "${files[@]}"; do
    cat -- "$file"
    # Terminate an unterminated last line so it cannot fuse with the first
    # line of the next file (and so wc -l counts it).
    if [[ -s "$file" ]] && (($(tail -c 1 < "$file" | wc -l) == 0)); then
      printf '\n'
    fi
  done | _combine_progress "$total_size" > "$temp_combined"

  echo "Counting total lines..."
  local total_lines
  total_lines=$(wc -l < "$temp_combined") || exit 1
  total_lines=$((total_lines)) # strip any padding some wc implementations add
  echo "Total lines: $total_lines"

  echo "Sorting and deduplicating..."
  local combined_size
  combined_size=$(wc -c < "$temp_combined") || exit 1
  # sort spills to temp_dir, so its scratch files are cleaned up with it.
  if ! _combine_progress "$combined_size" < "$temp_combined" \
    | sort -u -T "$temp_dir" > "$temp_output"; then
    echo "Sorting failed." >&2
    exit 1
  fi

  local unique_lines
  unique_lines=$(wc -l < "$temp_output") || exit 1
  unique_lines=$((unique_lines))
  echo "Unique lines: $unique_lines"

  # Calculate statistics. Multiply before dividing so the two decimals are
  # real digits (dividing first truncates the ratio to 2 places beforehand).
  local duplicates=$((total_lines - unique_lines))
  local duplicate_percentage="0.00"
  if ((total_lines > 0)); then
    local hundredths=$((duplicates * 10000 / total_lines))
    printf -v duplicate_percentage '%d.%02d' \
      $((hundredths / 100)) $((hundredths % 100))
  fi

  # Move the final sorted and deduplicated output to the desired location.
  if ! mv -- "$temp_output" "$output_file"; then
    echo "Failed to write output file: $output_file" >&2
    exit 1
  fi

  # Release the disk space now instead of waiting for the script to exit.
  rm -rf -- "$temp_dir"

  echo "Combined and deduplicated file created: $output_file"
  echo "Total lines: $total_lines"
  echo "Unique lines: $unique_lines"
  echo "Duplicates removed: $duplicates"
  echo "Percentage of duplicates: $duplicate_percentage%"
}
|
|
|
|
#######################################
# Interactive entry point: make sure dependencies are present, prompt for the
# input files and the output file name, then combine and deduplicate them.
# Arguments: none are read (any arguments passed are ignored)
# Inputs:    two lines on stdin - the input paths, then the output file name
# Outputs:   prompts when stdin is a terminal; error messages on stderr
# Exits:     1 if dependencies cannot be installed or either answer is empty
#######################################
main() {
  local -a files=()
  local output_file=""

  # Stop before prompting if the tools the pipeline needs are unavailable.
  if ! install_dependencies; then
    echo "Could not install required tools. Exiting." >&2
    exit 1
  fi

  # read -a splits the answer on whitespace, so paths that themselves contain
  # spaces cannot be entered at this prompt.
  read -rp "Enter the full paths to the files separated by spaces: " -a files

  if ((${#files[@]} == 0)); then
    echo "No files specified. Exiting." >&2
    exit 1
  fi

  read -rp "Enter the name for the output file: " output_file

  if [[ -z "$output_file" ]]; then
    echo "No output file name specified. Exiting." >&2
    exit 1
  fi

  combine_and_deduplicate "$output_file" "${files[@]}"
}
|
|
|
|
# Script entry point: invoke main, forwarding the command-line arguments.
main "$@"
|