selfmade_scripts/Combine_Wordlists
2026-04-09 17:07:00 +00:00

91 lines
2.6 KiB
Bash

#!/bin/bash
# Function to install dependencies
#
# Ensures the `pv` progress viewer is available, installing it through apt
# (via sudo) when it cannot be found.
# Globals:   none
# Arguments: none
# Outputs:   a progress message on stdout; an error message on stderr
# Exits:     with status 1 when pv is still unavailable after the install
#            attempt
install_dependencies() {
  # Nothing to do when pv is already available.
  if command -v pv &> /dev/null; then
    return 0
  fi

  echo "Installing required tools..."
  # A failed index refresh is not fatal on its own: the install below may
  # still succeed from the package lists that are already present.
  sudo apt update
  sudo apt install -y pv

  # Verify the result instead of trusting the install: the script's data
  # pipelines run through pv, so continuing without it would make them fail.
  if ! command -v pv &> /dev/null; then
    echo "Error: could not install pv. Please install it manually and retry." >&2
    exit 1
  fi
}
# Function to combine and deduplicate files
#
# Concatenates the given input files, sorts the combined data, removes
# duplicate lines, writes the result to the output file and prints line
# statistics.
# Globals:   none (installs an EXIT trap that removes the temporary directory)
# Arguments: $1   - path of the output file
#            $2.. - paths of the input files
# Outputs:   progress and statistics on stdout; error messages on stderr
# Exits:     with status 1 when an input is missing or unreadable, or when a
#            processing step fails
combine_and_deduplicate() {
  local output_file="$1"
  shift
  local files=("$@")
  local file file_size temp_dir temp_combined temp_output
  local total_size=0 total_lines unique_lines duplicates scaled
  local duplicate_percentage=0
  local -a pipe_status

  if (( ${#files[@]} == 0 )); then
    echo "No input files specified." >&2
    exit 1
  fi

  # Validate every input before doing any work, and add up the sizes so pv
  # can display a meaningful percentage.
  for file in "${files[@]}"; do
    if [[ ! -f "$file" ]]; then
      echo "File not found: $file" >&2
      exit 1
    fi
    if ! file_size=$(wc -c < "$file"); then
      echo "Cannot read file: $file" >&2
      exit 1
    fi
    total_size=$(( total_size + file_size ))
  done

  if ! temp_dir=$(mktemp -d); then
    echo "Failed to create a temporary directory." >&2
    exit 1
  fi
  temp_combined="$temp_dir/temp_combined.txt"
  temp_output="$temp_dir/combined_temp_output.txt"

  # The path is expanded now (shell-quoted via %q) on purpose: temp_dir is a
  # local variable and no longer exists when the EXIT trap runs after this
  # function has returned.
  # shellcheck disable=SC2064
  trap "rm -rf -- $(printf '%q' "$temp_dir")" EXIT

  echo "Combining files..."
  # Inputs are streamed in the order given; the data itself flows through pv.
  for file in "${files[@]}"; do
    cat -- "$file" || exit 1
    # Terminate an unterminated last line so it cannot fuse with the first
    # line of the next file.
    if [[ -s "$file" && "$(tail -c 1 -- "$file" | wc -l)" -eq 0 ]]; then
      echo
    fi
  done | pv -s "$total_size" > "$temp_combined"
  pipe_status=("${PIPESTATUS[@]}")
  if (( pipe_status[0] != 0 || pipe_status[1] != 0 )); then
    echo "Failed to combine the input files." >&2
    exit 1
  fi

  echo "Counting total lines..."
  total_lines=$(wc -l < "$temp_combined") || exit 1
  echo "Total lines: $total_lines"

  echo "Sorting and deduplicating..."
  # Byte-wise collation: locale-aware collation may treat distinct byte
  # strings as equal, in which case sort -u would drop lines that are not
  # true duplicates. It also makes the output order reproducible.
  pv "$temp_combined" | LC_ALL=C sort -u -T "$temp_dir" > "$temp_output"
  pipe_status=("${PIPESTATUS[@]}")
  if (( pipe_status[0] != 0 || pipe_status[1] != 0 )); then
    echo "Failed to sort and deduplicate the combined data." >&2
    exit 1
  fi

  unique_lines=$(wc -l < "$temp_output") || exit 1
  echo "Unique lines: $unique_lines"

  # Calculate statistics. The percentage is computed in hundredths with
  # integer arithmetic (multiply before dividing) so no precision is lost to
  # an early truncation and no external calculator is needed.
  duplicates=$(( total_lines - unique_lines ))
  if (( total_lines > 0 )); then
    scaled=$(( duplicates * 10000 / total_lines ))
    printf -v duplicate_percentage '%d.%02d' \
      "$(( scaled / 100 ))" "$(( scaled % 100 ))"
  fi

  # Move the final sorted and deduplicated output to the desired location
  if ! mv -- "$temp_output" "$output_file"; then
    echo "Failed to write output file: $output_file" >&2
    exit 1
  fi

  # Release the scratch space right away rather than waiting for script exit.
  rm -rf -- "$temp_dir"

  echo "Combined and deduplicated file created: $output_file"
  echo "Total lines: $total_lines"
  echo "Unique lines: $unique_lines"
  echo "Duplicates removed: $duplicates"
  echo "Percentage of duplicates: $duplicate_percentage%"
}
# Main function
#
# Interactive entry point: makes sure the required tools are present, prompts
# for the input files and the output file name, then hands both to
# combine_and_deduplicate.
# Arguments: none are read
# Inputs:    two lines from stdin - the whitespace-separated input paths,
#            then the output file name
# Outputs:   prompts (when stdin is a terminal); error messages on stderr
# Exits:     with status 1 when no input files or no output name is given
main() {
  # Keep the answers local so they do not leak into the global scope.
  local -a files=()
  local output_file=""

  install_dependencies

  # The answer is split on whitespace, so a path that itself contains spaces
  # cannot be entered at this prompt.
  read -rp "Enter the full paths to the files separated by spaces: " -a files
  if (( ${#files[@]} == 0 )); then
    echo "No files specified. Exiting." >&2
    exit 1
  fi

  read -rp "Enter the name for the output file: " output_file
  if [[ -z "$output_file" ]]; then
    echo "No output file name specified. Exiting." >&2
    exit 1
  fi

  combine_and_deduplicate "$output_file" "${files[@]}"
}
# Script entry point: call main, passing along the script's command-line arguments.
main "$@"