Since some users have pointed out that fnOS does not yet include a built-in duplicate-file finder, here is an interactive duplicate-finding/deletion script for Debian, for anyone who needs to locate and remove duplicate files.
sudo nano ./duplicate_finder_delete.sh
#!/bin/bash
# Duplicate file finder (supports multiple directories)
# Usage:
#   Single directory:      ./duplicate_finder_delete.sh /path/to/search [mode]
#   Multiple directories:  ./duplicate_finder_delete.sh "/path1 /path2 /path3" [mode]
#   Wildcards:             ./duplicate_finder_delete.sh "/parent/test*" [mode]
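# Requires GNU findutils/coreutils (find -printf, md5sum, stat, numfmt, mktemp),
# all part of a standard Debian install.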
SEARCH_PATHS="$1"
MODE="$2"
if [[ -z "$SEARCH_PATHS" ]]; then
    echo "Usage: $0 '<directory path>' [preview|auto]"
    echo "Examples:"
    echo "  Single directory:      $0 /vol1/1000/test preview"
    echo "  Multiple directories:  $0 '/vol1/1000/test /vol1/1000/test1' preview"
    echo "  Wildcards:             $0 '/vol1/1000/test*' preview"
    echo "  Mixed locations:       $0 '/vol1/1000/test /vol2/backup/test1 /home/user/docs' preview"
    exit 1
fi
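# Note: several directories must be passed as ONE quoted argument, since the
# script only reads "$1"; unquoted, the shell would split the list and only
# the first path would be scanned.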
# Parse and validate the directory paths
declare -a VALID_DIRS
if [[ "$SEARCH_PATHS" == *" "* ]]; then
    # Multiple paths, space-separated (paths containing spaces are not supported)
    IFS=' ' read -ra PATHS <<< "$SEARCH_PATHS"
    for path in "${PATHS[@]}"; do
        # Expand wildcards
        if [[ "$path" == *"*"* ]]; then
            expanded_paths=( $path )
            for exp_path in "${expanded_paths[@]}"; do
                if [[ -d "$exp_path" ]]; then
                    VALID_DIRS+=("$exp_path")
                else
                    echo "Warning: not a directory or does not exist: $exp_path"
                fi
            done
        else
            if [[ -d "$path" ]]; then
                VALID_DIRS+=("$path")
            else
                echo "Warning: not a directory or does not exist: $path"
            fi
        fi
    done
else
    # Single path (may contain wildcards)
    if [[ "$SEARCH_PATHS" == *"*"* ]]; then
        expanded_paths=( $SEARCH_PATHS )
        for exp_path in "${expanded_paths[@]}"; do
            if [[ -d "$exp_path" ]]; then
                VALID_DIRS+=("$exp_path")
            else
                echo "Warning: not a directory or does not exist: $exp_path"
            fi
        done
    else
        if [[ -d "$SEARCH_PATHS" ]]; then
            VALID_DIRS+=("$SEARCH_PATHS")
        else
            echo "Error: not a directory or does not exist: $SEARCH_PATHS"
            exit 1
        fi
    fi
fi
if [[ ${#VALID_DIRS[@]} -eq 0 ]]; then
    echo "Error: no valid directories found"
    exit 1
fi
echo "Scanning the following directories:"
for dir in "${VALID_DIRS[@]}"; do
    echo "  - $dir"
done
case "$MODE" in
auto)
echo "运行模式: 自动删除(保留最新修改的文件)"
;;
preview)
echo "运行模式: 预览(只显示重复文件,不删除)"
;;
*)
echo "运行模式: 交互选择删除"
;;
esac
echo "------------------------------------------------------"
TMP_DIR=$(mktemp -d)
SIZE_FILE="$TMP_DIR/sizes"
HASH_FILE="$TMP_DIR/hashes"
TOTAL_DUPLICATES=0
TOTAL_SIZE=0
DUPLICATE_GROUPS=0
# Step 1: record the size and path of every file in all directories
echo "Scanning file sizes..."
for dir in "${VALID_DIRS[@]}"; do
    echo "  Scanning: $dir"
    find "$dir" -type f -printf "%s:%p\n" >> "$SIZE_FILE"
done
echo "正在查找相同大小的文件..."
awk -F: '
{
size = $1
file = substr($0, index($0, ":") + 1)
sizes[size]++
files[size] = (files[size] ? files[size] "\n" : "") file
}
END {
for (size in sizes) {
if (sizes[size] > 1) {
split(files[size], file_array, "\n")
for (i in file_array) {
if (file_array[i] != "") {
print size ":" file_array[i]
}
}
}
}
}' "$SIZE_FILE" > "$TMP_DIR/same_size"
if [[ ! -s "$TMP_DIR/same_size" ]]; then
    echo "No files with identical sizes found."
    rm -rf "$TMP_DIR"
    echo "Total duplicate files: 0"
    echo "Scan complete."
    exit 0
fi
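# Step 2: hash only the size-collision candidates. Two files can be identical
# only if their sizes match, so the size pass above prunes most files before
# any (comparatively expensive) MD5 work is done.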
echo "正在计算MD5哈希值..."
while IFS=: read -r size filepath; do
if [[ -f "$filepath" ]]; then
hash=$(md5sum "$filepath" 2>/dev/null | cut -d' ' -f1)
if [[ -n "$hash" ]]; then
echo "$hash:$size:$filepath" >> "$HASH_FILE"
fi
fi
done < "$TMP_DIR/same_size"
if [[ ! -s "$HASH_FILE" ]]; then
    echo "No duplicate files found."
    rm -rf "$TMP_DIR"
    echo "Total duplicate files: 0"
    echo "Scan complete."
    exit 0
fi
echo "正在分析重复文件..."
sort "$HASH_FILE" | awk -F: '
{
hash = $1
size = $2
file = substr($0, index($0, ":") + 1)
file = substr(file, index(file, ":") + 1)
if (hash in groups) {
groups[hash] = groups[hash] "\n" file
counts[hash]++
} else {
groups[hash] = file
counts[hash] = 1
}
}
END {
for (hash in groups) {
if (counts[hash] > 1) {
print "HASH:" hash
split(groups[hash], file_array, "\n")
for (i in file_array) {
if (file_array[i] != "") {
print "FILE:" file_array[i]
}
}
print "END"
}
}
}' > "$TMP_DIR/final_groups"
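# final_groups now holds one record per duplicate set:
#   HASH:<md5>
#   FILE:<path>   (repeated, one line per copy)
#   END
# The loop below parses these records back into bash arrays.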
declare -a all_groups
group_hashes=()
current_hash=""
current_files=()
while IFS= read -r line; do
    if [[ "$line" =~ ^HASH:(.+)$ ]]; then
        # A new record starts: flush the previous group if it had duplicates
        if [[ -n "$current_hash" && ${#current_files[@]} -gt 1 ]]; then
            group_hashes+=("$current_hash")
            files_str=""
            for f in "${current_files[@]}"; do
                files_str="${files_str}${f}"$'\n'
            done
            all_groups+=("$files_str")
        fi
        current_hash="${BASH_REMATCH[1]}"
        current_files=()
    elif [[ "$line" =~ ^FILE:(.+)$ ]]; then
        filepath="${BASH_REMATCH[1]}"
        current_files+=("$filepath")
    fi
done < "$TMP_DIR/final_groups"
# Flush the final group after the loop ends
if [[ -n "$current_hash" && ${#current_files[@]} -gt 1 ]]; then
    group_hashes+=("$current_hash")
    files_str=""
    for f in "${current_files[@]}"; do
        files_str="${files_str}${f}"$'\n'
    done
    all_groups+=("$files_str")
fi
for ((i=0; i<${#group_hashes[@]}; i++)); do
    hash="${group_hashes[$i]}"
    files_str="${all_groups[$i]}"
    IFS=$'\n' read -rd '' -a group_files <<< "$files_str"
    # Drop entries that no longer exist on disk
    filtered_files=()
    for f in "${group_files[@]}"; do
        if [[ -n "$f" && -f "$f" ]]; then
            filtered_files+=("$f")
        fi
    done
    if [[ ${#filtered_files[@]} -lt 2 ]]; then
        continue
    fi
    ((DUPLICATE_GROUPS++))
    echo ""
    echo "============================================================"
    echo "Duplicate group #$DUPLICATE_GROUPS (MD5: $hash)"
    echo "============================================================"
    for j in "${!filtered_files[@]}"; do
        f="${filtered_files[$j]}"
        FILE_SIZE=$(stat -c %s "$f" 2>/dev/null || echo "0")
        FILE_TIME=$(stat -c "%y" "$f" 2>/dev/null || echo "unknown")
        echo "  [$((j+1))] Size: ${FILE_SIZE} bytes"
        echo "      Modified: ${FILE_TIME}"
        echo "      Path: $f"
        echo ""
    done
    # Count every copy after the first toward reclaimable space
    for f in "${filtered_files[@]:1}"; do
        FILE_SIZE=$(stat -c %s "$f" 2>/dev/null || echo "0")
        TOTAL_SIZE=$((TOTAL_SIZE + FILE_SIZE))
        TOTAL_DUPLICATES=$((TOTAL_DUPLICATES + 1))
    done
case "$MODE" in
"auto")
latest_file=""
latest_time=0
for f in "${filtered_files[@]}"; do
ftime=$(stat -c %Y "$f" 2>/dev/null || echo "0")
if [[ $ftime -gt $latest_time ]]; then
latest_time=$ftime
latest_file="$f"
fi
done
echo "🔄 自动模式:保留最新文件: $latest_file"
for f in "${filtered_files[@]}"; do
if [[ "$f" != "$latest_file" ]]; then
echo "🗑️ 删除: $f"
rm -f "$f"
fi
done
;;
"preview")
echo "👁️ 预览模式:发现 ${#filtered_files[@]} 个重复文件,不会删除"
;;
*)
echo "🤔 请选择要保留的文件(其余将被删除):"
echo " 输入文件编号(可输入多个,用空格分隔)"
echo " 直接按回车跳过这组文件"
echo ""
read -p "👉 请输入选择: " KEEP_CHOICE < /dev/tty
if [[ -n "$KEEP_CHOICE" ]]; then
KEEP_ARRAY=($KEEP_CHOICE)
valid_choices=()
for choice in "${KEEP_ARRAY[@]}"; do
if [[ "$choice" =~ ^[0-9]+$ ]] && [[ $choice -ge 1 ]] && [[ $choice -le ${#filtered_files[@]} ]]; then
valid_choices+=($choice)
else
echo "⚠️ 无效选择: $choice(已忽略)"
fi
done
if [[ ${#valid_choices[@]} -gt 0 ]]; then
echo ""
echo "🔄 处理中..."
for j in "${!filtered_files[@]}"; do
file_num=$((j+1))
should_keep=false
for keep_num in "${valid_choices[@]}"; do
if [[ $file_num -eq $keep_num ]]; then
should_keep=true
break
fi
done
if [[ "$should_keep" == "false" ]]; then
f="${filtered_files[$j]}"
echo "🗑️ 删除: $f"
rm -f "$f"
else
echo "💾 保留: ${filtered_files[$j]}"
fi
done
echo "✅ 本组处理完成"
else
echo "⏭️ 跳过此组(无有效选择)"
fi
else
echo "⏭️ 跳过此组"
fi
;;
esac
echo ""
done
rm -rf "$TMP_DIR"
echo "============================== Summary =============================="
echo "📊 Scanned directories: ${VALID_DIRS[*]}"
echo "📁 Duplicate groups: $DUPLICATE_GROUPS"
echo "📄 Duplicate files: $TOTAL_DUPLICATES"
if [[ $TOTAL_SIZE -gt 0 ]]; then
    echo "💾 Reclaimable space: $TOTAL_SIZE bytes ($(numfmt --to=iec-i --suffix=B $TOTAL_SIZE 2>/dev/null || echo "${TOTAL_SIZE}B"))"
else
    echo "💾 Reclaimable space: 0 bytes (0B)"
fi
echo "✅ Done"
echo "=================================================================="
Usage:
Preview mode (list duplicates only, delete nothing):
./duplicate_finder_delete.sh <directory to scan> preview
Interactive mode (the default):
./duplicate_finder_delete.sh <directory to scan>
Auto mode (keep the most recently modified file, delete the rest):
./duplicate_finder_delete.sh <directory to scan> auto
The three modes:
- preview → inspect duplicates safely
  Lists each duplicate group with file sizes and modification times without deleting anything, so you can review the results before committing to interactive or automatic cleanup.
- interactive → choose what to keep by hand
  You select the file(s) to keep in each group; the remaining duplicates are deleted.
- auto → keep the newest file, clean up in bulk
  Keeps the file with the newest modification time in each group and deletes all the others.
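A cautious workflow is to preview first and delete only after checking the output; for example, with the illustrative path from the script's help text:
./duplicate_finder_delete.sh '/vol1/1000/test*' preview
./duplicate_finder_delete.sh '/vol1/1000/test*' auto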