以下代码供遇到相同问题的朋友参考(需要 Python 环境):脚本会自动复制目标目录中缺失的文件,并自动截断超长的文件名和路径。使用前请先把相关目录通过 SMB 映射到同一台 Windows 机器上,然后在该机器上用 Python 运行本脚本。
import os
import shutil
import logging
import sys
from pathlib import Path
from datetime import datetime
# Configuration parameters - adjust these to your own setup.
SOURCE_DIRECTORY = Path("D:/test1")  # Source directory; the files under it are checked
TARGET_COMPARISON_DIRECTORY = Path("X:/test1")  # Target directory that the source is compared against
COPY_TARGET_DIRECTORY = Path("X:/test1")  # Destination that missing files are copied into
MISSING_FILE_LOG = Path("遗漏的文件.txt")  # Text file that receives the list of missing files
SHOULD_TRUNCATE_FILENAME = True  # Whether over-long names/paths are truncated
SHOULD_COPY_TO_TARGET = True  # Whether the copy is actually performed
# NOTE(review): the original comment calls 4096 the "Windows standard", but the
# classic Windows MAX_PATH is 260 - confirm which limit the target share needs.
MAX_WINDOWS_PATH_LENGTH = 4096  # Maximum full-path length (compared against UTF-8 byte counts)
MAX_WINDOWS_FILENAME_LENGTH = 255  # Maximum length of one path component (compared against UTF-8 byte counts)
SHOULD_PRINT_FILENAME_STATS = True  # Whether truncation / renaming details are logged
SHOULD_CHECK_MISSING = True  # Whether to diff against the target; when False every source file is treated as missing
# Logging setup: timestamped records at INFO level and above go to stdout.
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Module-wide logger shared by every function in this script.
LOGGER = logging.getLogger(__name__)
def get_file_list(root_directory):
    """Collect every file below *root_directory* as a set of relative paths.

    Args:
        root_directory: Directory to walk (str or Path).

    Returns:
        A set of strings, one per file found anywhere in the tree, each being
        the file's path relative to *root_directory*. Directories themselves
        are not listed.
    """
    return {
        str((Path(current_directory) / file_name).relative_to(root_directory))
        for current_directory, _subdirectories, file_names in os.walk(root_directory)
        for file_name in file_names
    }
def get_byte_length(input_string):
    """Return the number of bytes *input_string* occupies when encoded as UTF-8.

    Args:
        input_string: The text to measure.

    Returns:
        The UTF-8 byte count. If the string cannot be encoded (for example
        because it contains a lone surrogate code point), the character count
        is returned instead as a best-effort fallback.
    """
    try:
        return len(input_string.encode('utf-8'))
    except UnicodeEncodeError:
        # Only an encoding failure triggers the fallback; any other error
        # (wrong argument type, KeyboardInterrupt, ...) is allowed to surface.
        return len(input_string)
def truncate_path_component(path_component, max_byte_length):
    """Cut one path component (file or directory name) down to a byte budget.

    Args:
        path_component: The name to shorten.
        max_byte_length: Maximum size allowed, as measured by get_byte_length().

    Returns:
        *path_component* unchanged when it already fits; otherwise its longest
        leading run of whole characters whose summed byte lengths stay within
        *max_byte_length* (possibly the empty string).
    """
    if get_byte_length(path_component) <= max_byte_length:
        return path_component

    # Walk forward until the next character would overflow the budget, then
    # slice just before it so no character is ever split.
    used_bytes = 0
    for cut_index, character in enumerate(path_component):
        used_bytes += get_byte_length(character)
        if used_bytes > max_byte_length:
            return path_component[:cut_index]
    return path_component
def ensure_path_length(source_file_relative_path, target_base_directory):
    """Build the destination path for a file, shortening it to fit the limits.

    All lengths are measured with get_byte_length().

    The work happens in up to three stages:
      1. Every component of the relative path longer than
         MAX_WINDOWS_FILENAME_LENGTH is truncated, keeping its extension.
      2. If the full path still exceeds MAX_WINDOWS_PATH_LENGTH, directory
         components of the *relative* path are shortened (and, failing that,
         dropped) from the deepest one upwards until the path fits.
      3. If that does not help, the file name itself is truncated under the
         unshortened parent directory.

    Args:
        source_file_relative_path: Path of the file relative to the source
            root (str or Path).
        target_base_directory: Path object of the directory the file will be
            copied under. Its own components are never shortened or removed.

    Returns:
        A Path below *target_base_directory*. When SHOULD_TRUNCATE_FILENAME is
        false the relative path is appended unchanged. The result is best
        effort: it can still exceed the limit when the parent directory alone
        leaves no room for a file name.
    """
    if not SHOULD_TRUNCATE_FILENAME:
        return target_base_directory / source_file_relative_path

    # Stage 1: clamp each component (directory names and the file name).
    path_components = []
    for part in Path(source_file_relative_path).parts:
        component = str(part)
        component_byte_length = get_byte_length(component)
        if component_byte_length <= MAX_WINDOWS_FILENAME_LENGTH:
            path_components.append(component)
            continue

        file_extension = Path(component).suffix
        # Measure the extension in bytes, the same unit as the budget, so a
        # multibyte suffix cannot push the result over the limit.
        extension_byte_length = get_byte_length(file_extension)
        if extension_byte_length >= MAX_WINDOWS_FILENAME_LENGTH:
            # The "extension" alone would fill the whole budget; treat the
            # component as having none and simply cut it to the limit.
            file_extension = ""
            extension_byte_length = 0
        shortened_component = truncate_path_component(
            component, MAX_WINDOWS_FILENAME_LENGTH - extension_byte_length
        ) + file_extension
        path_components.append(shortened_component)
        if SHOULD_PRINT_FILENAME_STATS:
            # Report the name that is actually used, extension included.
            LOGGER.info(f"组件截断: {component_byte_length} 字节 -> {get_byte_length(shortened_component)} 字节")
            LOGGER.info(f"原始组件: {component}")
            LOGGER.info(f"截断后组件: {shortened_component}")

    processed_path = target_base_directory.joinpath(*path_components)

    # Stage 2: check the complete path.
    full_path_byte_length = get_byte_length(str(processed_path))
    if full_path_byte_length <= MAX_WINDOWS_PATH_LENGTH:
        return processed_path

    if SHOULD_PRINT_FILENAME_STATS:
        LOGGER.info(f"完整路径超长: {full_path_byte_length} 字节 (限制: {MAX_WINDOWS_PATH_LENGTH} 字节)")

    if not path_components:
        # Nothing below the base directory to shorten.
        return processed_path

    file_name = path_components[-1]
    # Only directories that belong to the relative path are candidates; the
    # base directory must stay intact so files never land outside the target.
    directory_components = path_components[:-1]
    while directory_components:
        last_directory_component = directory_components[-1]
        if len(last_directory_component) > 10:  # only shorten longer names
            # The byte budget is half the component's character count.
            directory_components[-1] = truncate_path_component(
                last_directory_component, len(last_directory_component) // 2
            )
            new_path = target_base_directory.joinpath(*directory_components, file_name)
            new_path_byte_length = get_byte_length(str(new_path))
            if new_path_byte_length <= MAX_WINDOWS_PATH_LENGTH:
                if SHOULD_PRINT_FILENAME_STATS:
                    LOGGER.info(f"路径整体截断成功: {full_path_byte_length} 字节 -> {new_path_byte_length} 字节")
                return new_path
        # Still too long: drop the deepest directory and try the next one up.
        directory_components.pop()

    # Stage 3: directory shortening did not help, so shrink the file name
    # while keeping the original (unshortened) parent directory.
    base_file_name = file_name
    file_extension = ""
    dot_index = file_name.rfind('.')
    # A leading dot (hidden file) or a trailing dot is not an extension.
    if 0 < dot_index < len(file_name) - 1:
        base_file_name = file_name[:dot_index]
        file_extension = file_name[dot_index:]

    current_parent_directory = processed_path.parent
    parent_directory_byte_length = get_byte_length(str(current_parent_directory))
    # One byte is taken by the separator between directory and file name.
    allowed_file_name_byte_length = MAX_WINDOWS_PATH_LENGTH - parent_directory_byte_length - 1
    if allowed_file_name_byte_length > 0:
        # Keep room for a numeric de-duplication suffix (up to "_9999").
        reserved_byte_length = 6
        max_base_file_name_length = max(
            1,
            allowed_file_name_byte_length - get_byte_length(file_extension) - reserved_byte_length,
        )
        new_file_name = truncate_path_component(base_file_name, max_base_file_name_length) + file_extension
        processed_path = current_parent_directory / new_file_name
        if SHOULD_PRINT_FILENAME_STATS:
            LOGGER.info(f"文件名二次截断: {get_byte_length(file_name)} 字节 -> {get_byte_length(new_file_name)} 字节")
            LOGGER.info(f"原始文件名: {file_name}")
            LOGGER.info(f"二次截断后文件名: {new_file_name}")
    return processed_path
def handle_duplicate_filename(target_directory, original_file_name):
    """Find an unused file name in *target_directory* by adding a numeric suffix.

    Candidates have the form ``<base>_<n><extension>`` with n = 1, 2, 3, ...
    The base is truncated so that each candidate stays within
    MAX_WINDOWS_FILENAME_LENGTH as measured by get_byte_length().

    Args:
        target_directory: Path object of the directory the file will live in.
        original_file_name: The file name that already exists there.

    Returns:
        The Path of the first candidate that does not exist yet.
    """
    # Split at the last dot; a leading dot (hidden file) is not an extension.
    base_file_name = original_file_name
    file_extension = ""
    dot_index = original_file_name.rfind('.')
    if dot_index > 0:
        base_file_name = original_file_name[:dot_index]
        file_extension = original_file_name[dot_index:]

    # Leave room for the extension plus a suffix of up to six bytes ("_9999").
    max_base_file_name_length = MAX_WINDOWS_FILENAME_LENGTH - get_byte_length(file_extension) - 6
    truncated_base_file_name = truncate_path_component(base_file_name, max(max_base_file_name_length, 1))

    counter = 1
    while True:
        suffix = f"_{counter}"
        new_file_name = truncated_base_file_name + suffix + file_extension
        excess_byte_length = get_byte_length(new_file_name) - MAX_WINDOWS_FILENAME_LENGTH
        if excess_byte_length > 0:
            # The suffix outgrew the reserved space. Shrink the base by the
            # overflow, computed in bytes on both sides so multibyte names
            # are not cut far more than necessary.
            truncated_base_file_name = truncate_path_component(
                truncated_base_file_name,
                max(1, get_byte_length(truncated_base_file_name) - excess_byte_length),
            )
            new_file_name = truncated_base_file_name + suffix + file_extension
        new_file_path = target_directory / new_file_name
        if not new_file_path.exists():
            if SHOULD_PRINT_FILENAME_STATS:
                LOGGER.info("文件名重复处理:")
                LOGGER.info(f"原始文件名: {original_file_name}")
                LOGGER.info(f"生成唯一文件名: {new_file_name}")
            return new_file_path
        counter += 1
def copy_missing_files(missing_file_relative_paths):
    """Record the missing files and, if enabled, copy them to the target.

    Every path is written to MISSING_FILE_LOG. When SHOULD_COPY_TO_TARGET is
    true each file is also copied from SOURCE_DIRECTORY to a location under
    COPY_TARGET_DIRECTORY chosen by ensure_path_length(); an existing
    destination is never overwritten, a uniquely suffixed name is used instead.
    A failure to copy one file is logged and does not stop the others.

    Args:
        missing_file_relative_paths: Sized collection of paths (strings)
            relative to SOURCE_DIRECTORY.

    Returns:
        None. Results are reported through LOGGER.
    """
    missing_file_count = len(missing_file_relative_paths)
    successful_copy_count = 0
    failed_copy_count = 0
    is_log_written = False
    try:
        # backslashreplace keeps a file name that cannot be encoded as UTF-8
        # (e.g. one holding a lone surrogate) from aborting the whole run.
        with open(MISSING_FILE_LOG, 'w', encoding='utf-8', errors='backslashreplace') as writer:
            # Sorted so the log and the copy order are reproducible.
            for relative_path in sorted(missing_file_relative_paths):
                source_file_path = SOURCE_DIRECTORY / relative_path
                target_file_path = ensure_path_length(relative_path, COPY_TARGET_DIRECTORY)
                # Record the missing file.
                writer.write(f"{source_file_path}\n")
                LOGGER.info(f"遗漏文件: {source_file_path}")
                if not SHOULD_COPY_TO_TARGET:
                    continue
                try:
                    # Make sure the destination directory exists.
                    target_directory = target_file_path.parent
                    target_directory.mkdir(parents=True, exist_ok=True)
                    # Pick a fresh name instead of overwriting an existing file.
                    if target_file_path.exists():
                        target_file_path = handle_duplicate_filename(target_directory, target_file_path.name)
                    # copy2 also carries over file metadata.
                    shutil.copy2(source_file_path, target_file_path)
                    LOGGER.info(f"成功复制: {source_file_path} -> {target_file_path}")
                    successful_copy_count += 1
                except Exception as e:
                    # Per-file boundary: log and carry on with the next file.
                    LOGGER.error(f"复制失败: {source_file_path} -> {target_file_path}, 错误: {str(e)}")
                    failed_copy_count += 1
        is_log_written = True
    except Exception as e:
        LOGGER.error(f"写入日志文件失败: {str(e)}")
    LOGGER.info(f"检查完成: 共发现 {missing_file_count} 个缺失文件")
    if SHOULD_COPY_TO_TARGET:
        LOGGER.info(f"复制结果: 成功 {successful_copy_count} 个, 失败 {failed_copy_count} 个")
    if is_log_written:
        # Only claim the log exists when it was written completely.
        LOGGER.info(f"遗漏的文件已记录到: {MISSING_FILE_LOG}")
def main():
    """Entry point: work out which files need copying and hand them over.

    With SHOULD_CHECK_MISSING enabled, only files present under
    SOURCE_DIRECTORY but absent under TARGET_COMPARISON_DIRECTORY are
    processed; otherwise every file under SOURCE_DIRECTORY is. Any exception
    is logged together with its traceback instead of being raised.
    """
    LOGGER.info(f"开始操作,是否检测文件缺失: {SHOULD_CHECK_MISSING}")
    try:
        # Start from everything in the source tree ...
        missing_file_relative_paths = get_file_list(SOURCE_DIRECTORY)
        if SHOULD_CHECK_MISSING:
            # ... and keep only what the comparison directory lacks.
            missing_file_relative_paths = (
                missing_file_relative_paths - get_file_list(TARGET_COMPARISON_DIRECTORY)
            )
        copy_missing_files(missing_file_relative_paths)
    except Exception as error:
        LOGGER.error(f"操作失败: {str(error)}", exc_info=True)
# Run the tool only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()