import os
import re

# Path configuration.
# INPUT_DIR holds the plain-text sources. OUTPUT_DIR is a "markdown_output"
# folder under INPUT_DIR's parent directory; it is created when this module
# is loaded if it does not exist yet.
INPUT_DIR = r"C:\Users\lenovo\Desktop\ea_rag_project\content"
_PARENT_DIR = os.path.dirname(INPUT_DIR)
OUTPUT_DIR = os.path.join(_PARENT_DIR, "markdown_output")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Heading detector: a dotted section number ("1", "2.1", "3.4.5") or one of the
# fixed front/back-matter keywords, optionally followed by a title.
_HEADING_RE = re.compile(
    r'^(?:(?P<num>\d+(?:\.\d+)*)|(?P<kw>附录[A-Za-z]|本规范用词说明|公告|前言|目次))'
    r'\s*(?P<title>.*)'
)
# Runs of whitespace inside body lines are collapsed to a single space.
_WHITESPACE_RE = re.compile(r'\s+')


def extract_markdown_from_text(text):
    """Turn plain text into a list of Markdown lines.

    Each non-blank input line is stripped and classified:

    * Lines starting with a dotted section number or one of the keywords
      ``附录<letter>``, ``本规范用词说明``, ``公告``, ``前言``, ``目次`` become
      ATX headings. Keywords and single numbers ("1") are level 2, two-part
      numbers ("3.1") are level 3, and anything deeper is capped at level 4.
    * A heading whose full text has already been emitted is dropped.
    * Every other line is kept as body text with internal whitespace runs
      collapsed to one space.

    Args:
        text: The whole document as one string. A leading UTF-8 BOM is
            ignored.

    Returns:
        A list of strings, each terminated by ``"\\n"``, in input order.
    """
    # A BOM is not whitespace for str.strip(); left in place it would stop the
    # very first line from being recognised as a heading.
    if text.startswith('\ufeff'):
        text = text[1:]

    md_lines = []
    seen_headings = set()  # full heading texts already emitted

    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            continue

        match = _HEADING_RE.match(stripped)
        if match is None:
            # Ordinary paragraph line: normalise spacing, keep the wording.
            md_lines.append(_WHITESPACE_RE.sub(' ', stripped) + "\n")
            continue

        number = match.group('num')
        label = number if number is not None else match.group('kw')
        title = match.group('title').strip()
        # A bare label (e.g. "3.1" with nothing after it) is kept as-is.
        full_title = f"{label} {title}" if title else label

        # Skip repeated headings (e.g. the same title appearing both in the
        # table of contents and in the body).
        if full_title in seen_headings:
            continue
        seen_headings.add(full_title)

        if number is None:
            level = 2  # keyword headings are always top-level sections
        else:
            # "1" -> 2, "3.1" -> 3, "3.1.1" and deeper -> 4.
            level = min(number.count('.') + 2, 4)
        md_lines.append(f"{'#' * level} {full_title}\n")

    return md_lines

def process_file(input_path, output_path):
    """Convert one plain-text file to Markdown and write the result.

    Reads *input_path*, passes its content through
    ``extract_markdown_from_text``, writes the returned lines to
    *output_path* as UTF-8, and prints a confirmation line.

    Args:
        input_path: Path of the UTF-8 text file to read.
        output_path: Path of the Markdown file to create or overwrite.
    """
    # "utf-8-sig" reads BOM-less UTF-8 exactly like "utf-8" but also drops a
    # leading BOM (as written by some Windows editors), which would otherwise
    # end up glued to the first line of the text.
    with open(input_path, 'r', encoding='utf-8-sig') as src:
        content = src.read()

    md_content = extract_markdown_from_text(content)

    with open(output_path, 'w', encoding='utf-8') as dst:
        dst.writelines(md_content)

    print(f"✅ 已生成：{output_path}")

# Entry point: convert every .txt or .md file found in INPUT_DIR.
if __name__ == "__main__":
    # NOTE: this script expects the .docx sources to have been exported to
    # plain text already (e.g. 50763.txt). If only a .docx is available,
    # extract its text first (for example with python-docx) and save it as .txt.

    # Sorted so the processing order does not depend on the file system.
    text_files = sorted(
        name for name in os.listdir(INPUT_DIR)
        if name.lower().endswith(('.txt', '.md'))
    )
    if not text_files:
        print("⚠️ 请先将 .docx 转换为纯文本文件（如 50763.txt），放在 content 目录下")
        # The bare exit() helper is injected by the `site` module and is not
        # always available; raising SystemExit works everywhere.
        raise SystemExit(1)

    for filename in text_files:
        input_path = os.path.join(INPUT_DIR, filename)
        # Output keeps the base name and always gets a .md extension.
        output_filename = os.path.splitext(filename)[0] + ".md"
        output_path = os.path.join(OUTPUT_DIR, output_filename)
        process_file(input_path, output_path)

    print(f"\n🎉 所有文件已转换完毕！输出目录：{OUTPUT_DIR}")