#!/bin/bash
#
# Combine_Wordlists - merge several wordlists into one sorted, de-duplicated
# file and report how many duplicate lines were removed.
#
# Usage:
#   Combine_Wordlists                             # prompt for inputs/output
#   Combine_Wordlists OUTPUT_FILE INPUT_FILE...   # non-interactive; this form
#                                                 # also allows paths that
#                                                 # contain spaces
#
# Requires: sort, wc, tail, od, tr, mktemp. pv is optional and is only used to
# draw progress bars.

# Let a pipeline report failure when any stage fails, so the explicit status
# checks below actually see a failing reader or sorter.
set -o pipefail

# Scratch directory of the run in progress. It is a global (not a local) so
# that the EXIT trap can still expand it after the function that created it
# has returned and its locals have gone out of scope.
_cw_temp_dir=''

#######################################
# Remove the scratch directory of the current run, if there is one.
# Globals:   _cw_temp_dir (read, reset to empty)
#######################################
_cw_cleanup() {
  if [[ -n "$_cw_temp_dir" ]]; then
    rm -rf -- "$_cw_temp_dir"
    _cw_temp_dir=''
  fi
}

#######################################
# Copy stdin to stdout, through pv when it is installed, through cat otherwise.
# Outputs:   the unmodified input on stdout
#######################################
_cw_stream() {
  if command -v pv > /dev/null 2>&1; then
    pv
  else
    cat
  fi
}

#######################################
# Tell whether a file is empty or ends with a newline byte.
# Arguments: $1 - path of the file to inspect
# Returns:   0 if nothing needs to be appended to terminate the last line,
#            1 if the last byte is something other than a newline
#######################################
_cw_ends_with_newline() {
  local last_byte
  # Inspect the byte as hex so that a trailing newline is not swallowed by
  # command substitution and a NUL byte does not upset the shell.
  last_byte=$(tail -c 1 < "$1" | od -An -tx1 | tr -d ' \n')
  [[ -z "$last_byte" || "$last_byte" == "0a" ]]
}

#######################################
# Write all given files to stdout, making sure every file ends with a newline
# so that the last line of one file can never fuse with the first line of the
# next one.
# Arguments: $@ - paths of the files to concatenate
# Returns:   0 on success, 1 if a file could not be read
#######################################
_cw_concatenate() {
  local file
  for file in "$@"; do
    _cw_stream < "$file" || return 1
    if ! _cw_ends_with_newline "$file"; then
      printf '\n'
    fi
  done
  return 0
}

#######################################
# Make sure pv is available, installing it with apt-get when it is missing.
# Outputs:   progress/diagnostic messages
# Returns:   0 if pv is available afterwards, non-zero otherwise
#######################################
install_dependencies() {
  local -a as_root=()

  if command -v pv > /dev/null 2>&1; then
    return 0
  fi

  if ! command -v apt-get > /dev/null 2>&1; then
    echo "pv is not installed and apt-get is not available to install it." >&2
    return 1
  fi

  # Only go through sudo when the script is not already running as root.
  if ((EUID != 0)); then
    if ! command -v sudo > /dev/null 2>&1; then
      echo "pv is not installed and sudo is not available to install it." >&2
      return 1
    fi
    as_root=(sudo)
  fi

  echo "Installing required tools..."
  "${as_root[@]}" apt-get update && "${as_root[@]}" apt-get install -y pv
}

#######################################
# Combine the given files into one sorted file without duplicate lines and
# print line statistics.
# Globals:   _cw_temp_dir (written), EXIT trap (set to _cw_cleanup)
# Arguments: $1 - path of the output file
#            $2... - paths of the input files
# Outputs:   progress messages and statistics on stdout, errors on stderr
# Returns:   0 on success, 1 on any error (the output file is only written
#            after sorting has succeeded)
#######################################
combine_and_deduplicate() {
  if (($# < 2)) || [[ -z "$1" ]]; then
    echo "combine_and_deduplicate: need an output file and at least one input file" >&2
    return 1
  fi

  local output_file="$1"
  shift
  local files=("$@")
  local file temp_combined temp_output
  local total_lines unique_lines duplicates hundredths
  local duplicate_percentage=0

  # Validate every input before doing any work.
  for file in "${files[@]}"; do
    if [[ ! -f "$file" ]]; then
      echo "File not found: $file" >&2
      return 1
    fi
  done

  if ! _cw_temp_dir=$(mktemp -d); then
    _cw_temp_dir=''
    echo "Failed to create a temporary directory" >&2
    return 1
  fi
  # Safety net for early exits and interrupts; the normal paths below call
  # _cw_cleanup themselves.
  trap _cw_cleanup EXIT

  temp_combined="$_cw_temp_dir/temp_combined.txt"
  temp_output="$_cw_temp_dir/combined_temp_output.txt"

  echo "Combining files..."
  if ! _cw_concatenate "${files[@]}" > "$temp_combined"; then
    echo "Failed to combine the input files" >&2
    _cw_cleanup
    return 1
  fi

  echo "Counting total lines..."
  # The arithmetic expansion strips the padding some wc implementations add.
  total_lines=$(($(wc -l < "$temp_combined")))
  echo "Total lines: $total_lines"

  echo "Sorting and deduplicating..."
  # LC_ALL=C makes sort compare raw bytes, so only byte-identical lines count
  # as duplicates regardless of the user's locale collation rules.
  if ! _cw_stream < "$temp_combined" \
    | LC_ALL=C sort -u -T "$_cw_temp_dir" > "$temp_output"; then
    echo "Sorting failed" >&2
    _cw_cleanup
    return 1
  fi
  unique_lines=$(($(wc -l < "$temp_output")))
  echo "Unique lines: $unique_lines"

  # Percentage with two decimals using integer arithmetic only: multiply
  # before dividing so the fractional digits are not truncated away.
  duplicates=$((total_lines - unique_lines))
  if ((total_lines > 0)); then
    hundredths=$((duplicates * 10000 / total_lines))
    printf -v duplicate_percentage '%d.%02d' \
      "$((hundredths / 100))" "$((hundredths % 100))"
  fi

  # Move the final sorted and deduplicated output to the desired location.
  if ! mv -f -- "$temp_output" "$output_file"; then
    echo "Failed to write output file: $output_file" >&2
    _cw_cleanup
    return 1
  fi
  _cw_cleanup

  echo "Combined and deduplicated file created: $output_file"
  echo "Total lines: $total_lines"
  echo "Unique lines: $unique_lines"
  echo "Duplicates removed: $duplicates"
  echo "Percentage of duplicates: $duplicate_percentage%"
}

#######################################
# Entry point: take the output and input paths from the arguments, or prompt
# for them when no arguments are given, then combine the files.
# Arguments: none, or OUTPUT_FILE followed by one or more INPUT_FILEs
# Returns:   0 on success, 1 on missing input or processing errors,
#            2 on wrong usage
#######################################
main() {
  local output_file=''
  local -a files=()

  if (($# == 1)); then
    echo "Usage: ${0##*/} [OUTPUT_FILE INPUT_FILE...]" >&2
    return 2
  fi

  if (($# >= 2)); then
    output_file=$1
    shift
    files=("$@")
  else
    read -rp "Enter the full paths to the files separated by spaces: " -a files
    if ((${#files[@]} == 0)); then
      echo "No files specified. Exiting." >&2
      return 1
    fi
    read -rp "Enter the name for the output file: " output_file
  fi

  if [[ -z "$output_file" ]]; then
    echo "No output file name specified. Exiting." >&2
    return 1
  fi

  # pv only provides progress bars, so a failed installation is not fatal.
  if ! install_dependencies; then
    echo "Continuing without progress bars." >&2
  fi

  combine_and_deduplicate "$output_file" "${files[@]}"
}

main "$@"