#!/bin/bash
#
# Combine several text files into a single sorted file with duplicate lines
# removed, then report how many duplicate lines were dropped.
#
# Usage: run the script and answer the two prompts (input paths, output name).
# Input paths are split on whitespace by `read -a`, so a path that contains
# spaces cannot be entered at the prompt.

#######################################
# Best-effort installation of pv, which is only used for progress bars.
# Does nothing when pv is already on PATH. A missing apt-get or a failed
# install is reported on stderr and is not fatal, because the rest of the
# script falls back to plain pass-through when pv is absent.
# Arguments: none
# Outputs:   status line on stdout, warnings on stderr
# Returns:   always 0
#######################################
install_dependencies() {
  if command -v pv > /dev/null 2>&1; then
    return 0
  fi

  echo "Installing required tools..."
  if ! command -v apt-get > /dev/null 2>&1; then
    echo "Warning: apt-get not found; continuing without progress bars." >&2
    return 0
  fi

  # Only go through sudo when we are not already root.
  local -a as_root=()
  if ((EUID != 0)); then
    as_root=(sudo)
  fi

  if ! { "${as_root[@]}" apt-get update \
    && "${as_root[@]}" apt-get install -y pv; }; then
    echo "Warning: could not install pv; continuing without progress bars." >&2
  fi
  return 0
}

#######################################
# Copy stdin to stdout, showing a pv progress bar when pv is available.
# Arguments: $1 - expected number of bytes (passed to `pv -s`)
#######################################
_progress() {
  local expected_bytes=$1
  if command -v pv > /dev/null 2>&1; then
    pv -s "$expected_bytes"
  else
    cat
  fi
}

#######################################
# Write the given files to stdout one after another, appending a newline to
# any non-empty file whose last byte is not a newline so that its final line
# cannot fuse with the first line of the next file.
# Arguments: $@ - files to emit, in order
# Returns:   non-zero as soon as one file cannot be read
#######################################
_emit_lines() {
  local path
  for path in "$@"; do
    cat -- "$path" || return
    # Command substitution strips a trailing newline, so the result is
    # non-empty only when the last byte is something other than a newline.
    if [[ -n "$(tail -c 1 -- "$path")" ]]; then
      printf '\n'
    fi
  done
}

#######################################
# Concatenate the input files, sort them, drop duplicate lines and move the
# result to the output path; print line statistics along the way.
#
# The body runs in a subshell (note the parentheses): the EXIT trap that
# removes the scratch directory therefore fires when this function finishes,
# and `exit` inside only ends the function, not the calling script.
#
# Arguments: $1   - path of the output file
#            $2.. - input files
# Outputs:   progress and statistics on stdout, errors on stderr
# Returns:   0 on success, 1 on a missing input or any failed step
#######################################
combine_and_deduplicate() (
  local output_file=$1
  shift
  local -a files=("$@")
  local file
  local total_size=0 combined_size=0
  local total_lines=0 unique_lines=0 duplicates=0
  local pct_x100=0 duplicate_percentage=""

  set -o pipefail

  # Validate every input up front so nothing is written if one is missing.
  for file in "${files[@]}"; do
    if [[ ! -f "$file" ]]; then
      echo "File not found: $file" >&2
      exit 1
    fi
  done

  # Deliberately not `local`: the subshell already keeps it out of the
  # caller's scope, and the EXIT trap below has to be able to read it.
  temp_dir=$(mktemp -d) || {
    echo "Could not create a temporary directory" >&2
    exit 1
  }
  trap 'rm -rf -- "$temp_dir"' EXIT

  local temp_combined="$temp_dir/temp_combined.txt"
  local temp_output="$temp_dir/combined_temp_output.txt"

  for file in "${files[@]}"; do
    total_size=$((total_size + $(wc -c < "$file")))
  done

  echo "Combining files..."
  if ! _emit_lines "${files[@]}" | _progress "$total_size" > "$temp_combined"; then
    echo "Failed to combine input files" >&2
    exit 1
  fi

  echo "Counting total lines..."
  total_lines=$(wc -l < "$temp_combined") || exit 1
  total_lines=$((total_lines))
  echo "Total lines: $total_lines"

  echo "Sorting and deduplicating..."
  combined_size=$(wc -c < "$temp_combined") || exit 1
  # sort spills to $temp_dir (-T) so its scratch files are cleaned up with it.
  if ! _progress "$((combined_size))" < "$temp_combined" \
    | sort -u -T "$temp_dir" > "$temp_output"; then
    echo "Sorting failed" >&2
    exit 1
  fi

  unique_lines=$(wc -l < "$temp_output") || exit 1
  unique_lines=$((unique_lines))
  echo "Unique lines: $unique_lines"

  # Percentage with two decimals using integer arithmetic: multiply before
  # dividing so the fractional part is not truncated away.
  duplicates=$((total_lines - unique_lines))
  if ((total_lines > 0)); then
    pct_x100=$((duplicates * 10000 / total_lines))
  fi
  printf -v duplicate_percentage '%d.%02d' \
    "$((pct_x100 / 100))" "$((pct_x100 % 100))"

  if ! mv -- "$temp_output" "$output_file"; then
    echo "Could not write output file: $output_file" >&2
    exit 1
  fi

  echo "Combined and deduplicated file created: $output_file"
  echo "Total lines: $total_lines"
  echo "Unique lines: $unique_lines"
  echo "Duplicates removed: $duplicates"
  echo "Percentage of duplicates: $duplicate_percentage%"
)

#######################################
# Prompt for the input files and the output name, then run the merge.
# Dependencies are installed only after both answers are valid, so sudo is
# not invoked when there is nothing to do.
# Arguments: ignored
# Returns:   1 when an answer is empty, otherwise the status of
#            combine_and_deduplicate
#######################################
main() {
  local -a files=()
  local output_file=""

  read -rp "Enter the full paths to the files separated by spaces: " -a files
  if ((${#files[@]} == 0)); then
    echo "No files specified. Exiting." >&2
    return 1
  fi

  read -rp "Enter the name for the output file: " output_file
  if [[ -z "$output_file" ]]; then
    echo "No output file name specified. Exiting." >&2
    return 1
  fi

  install_dependencies
  combine_and_deduplicate "$output_file" "${files[@]}"
}

main "$@"