# -*- coding: utf-8 -*-
"""从 guwendao 园冶书目页抓取篇目、分节与每行正文，生成园冶_篇目与分节.csv/tsv。"""
import csv
import os
import re
import urllib.request

from bs4 import BeautifulSoup

# Site root; prefixed to root-relative chapter hrefs to make them absolute.
BASE = "https://www.guwendao.net"
# Table-of-contents page of the book that the script scrapes.
BOOK_URL = f"{BASE}/guwen/book_8c007e6f6685.aspx"
# User-Agent header value sent with every HTTP request.
UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"

# In-chapter subsection heading: the paragraph starts with a full-width opening
# parenthesis, one or more Chinese numerals, and a full-width closing parenthesis.
SUBSECTION_HEAD = re.compile(r"^（[一二三四五六七八九十百千万]+）")


def fetch(url: str) -> str:
    """Download *url* and return the response body as text.

    The request carries the module's ``UA`` string as its User-Agent header
    and uses a 60-second timeout. The body is decoded as UTF-8; bytes that are
    not valid UTF-8 are replaced rather than raising.
    """
    request = urllib.request.Request(url, headers={"User-Agent": UA})
    with urllib.request.urlopen(request, timeout=60) as response:
        payload = response.read()
    return payload.decode("utf-8", errors="replace")


def parse_toc(html: str) -> list[tuple[str, str, str]]:
    """Parse the book's table-of-contents page into chapter entries.

    Every ``div.bookMl`` element is treated as a volume header: the text of
    its ``<strong>`` child, when present, becomes the current volume name.
    The sibling elements that follow the header, up to the next sibling whose
    class list contains ``bookMl``, are scanned for chapter links.

    Args:
        html: Markup of the book index page.

    Returns:
        ``(volume name, chapter title, URL)`` tuples in page order. Only
        anchors with non-empty text whose ``href`` contains ``bookv_`` are
        kept. An ``href`` starting with ``/`` is prefixed with ``BASE``; any
        other ``href`` is returned as found.
    """
    soup = BeautifulSoup(html, "html.parser")
    rows: list[tuple[str, str, str]] = []
    current_juan = ""
    for div in soup.select("div.bookMl"):
        strong = div.find("strong")
        if strong:
            current_juan = strong.get_text(strip=True)
        nxt = div.find_next_sibling()
        while nxt and "bookMl" not in (nxt.get("class") or []):
            for a in nxt.find_all("a", href=True):
                href = a["href"].strip()
                title = a.get_text(strip=True)
                # Skip only this anchor. The sibling cursor must not move
                # here: the anchor list belongs to the current sibling, and
                # moving the cursor per rejected anchor would skip siblings
                # (and could leave the cursor at None before the step below).
                if not title or "bookv_" not in href:
                    continue
                if href.startswith("/"):
                    href = BASE + href
                rows.append((current_juan, title, href))
            # Advance exactly once per sibling, after all its anchors are read.
            nxt = nxt.find_next_sibling()
    return rows


def is_subsection_heading(t: str) -> bool:
    """Return True when paragraph text *t* looks like a subsection heading.

    A heading must be non-empty, start with the numbered marker matched by
    ``SUBSECTION_HEAD``, and be at most 120 characters long. A candidate
    longer than 55 characters is rejected when a full stop ("。") occurs
    within its first 40 characters.
    """
    # Empty text is rejected before the pattern is consulted.
    if not t:
        return False
    if SUBSECTION_HEAD.match(t) is None:
        return False

    length = len(t)
    if length > 120:
        return False

    # Long text with an early sentence end is treated as body, not a title.
    reads_like_body = length > 55 and "。" in t[:40]
    return not reads_like_body


def chapter_intro_and_sections(html: str) -> tuple[str, list[tuple[str, str]]]:
    """Split a chapter page into its lead text and its numbered subsections.

    The text of every ``<p>`` inside ``div.contson`` is collected (stripped,
    empty paragraphs dropped) and paragraphs are joined with newlines.

    Returns:
        ``(chapter text, [(subsection title, subsection text), ...])``.

        * No ``div.contson`` or no non-empty paragraph: ``("", [])``.
        * No paragraph recognised by ``is_subsection_heading``: the chapter
          text is all paragraphs joined, and the list is empty.
        * Otherwise the chapter text is everything before the first heading,
          and each subsection text is the paragraphs after its heading up to
          the next heading (or the end of the chapter).
    """
    container = BeautifulSoup(html, "html.parser").select_one("div.contson")
    if container is None:
        return "", []

    paragraphs = []
    for p_tag in container.find_all("p"):
        text = p_tag.get_text(strip=True)
        if text:
            paragraphs.append(text)
    if not paragraphs:
        return "", []

    heading_at = [
        pos for pos, text in enumerate(paragraphs) if is_subsection_heading(text)
    ]
    if not heading_at:
        return "\n".join(paragraphs), []

    # Pair each heading position with the position where its section stops:
    # the next heading, or the end of the paragraph list for the last one.
    stops = heading_at[1:] + [len(paragraphs)]
    sections = [
        (paragraphs[start], "\n".join(paragraphs[start + 1 : stop]))
        for start, stop in zip(heading_at, stops)
    ]
    lead = "\n".join(paragraphs[: heading_at[0]])
    return lead, sections


def write_table(
    table: list[dict[str, str]],
    base_name: str,
    folder: str = r"C:\Users\as\Desktop\同济讲座",
) -> None:
    """Write *table* as a TSV file and a CSV file, printing each output path.

    Both files share the same fixed column set and are encoded as UTF-8 with
    a byte-order mark (``utf-8-sig``).

    Args:
        table: Rows to write; each row maps column names to cell text.
            Columns missing from a row are written as empty cells.
        base_name: File name without extension; ``.tsv`` and ``.csv`` are
            appended.
        folder: Existing directory that receives both files. Defaults to the
            location the script has always written to.

    Raises:
        ValueError: If a row contains a key that is not one of the columns.
        OSError: If a file cannot be opened for writing (for example when
            *folder* does not exist).
    """
    fields = [
        "上一个分类",
        "篇目或分节",
        "正文",
        "白话翻译",
        "设计方法通俗说明",
        "底层哲学通俗说",
        "底层心理学通俗说",
        "当时社会文化通俗说",
        "页面地址",
    ]
    for ext, delim in (("tsv", "\t"), ("csv", ",")):
        # os.path.join uses the platform separator, so a caller-supplied
        # folder works on any operating system.
        path = os.path.join(folder, f"{base_name}.{ext}")
        # newline="" lets the csv module control line endings itself.
        with open(path, "w", encoding="utf-8-sig", newline="") as f:
            w = csv.DictWriter(f, fieldnames=fields, delimiter=delim)
            w.writeheader()
            w.writerows(table)
        print(path)


def main() -> None:
    """Scrape the whole book and write the chapter/subsection table.

    Downloads the table-of-contents page, then every chapter page listed on
    it. Each chapter yields one row (classified under its volume, holding
    the chapter-level text) followed by one row per subsection (classified
    under the chapter). The translation and commentary columns are left
    empty. The finished table goes to ``write_table`` and the row count is
    printed.
    """
    toc = parse_toc(fetch(BOOK_URL))

    def make_row(category: str, heading: str, body: str, page_url: str) -> dict[str, str]:
        # One output record; every column other than the four arguments is blank.
        return {
            "上一个分类": category,
            "篇目或分节": heading,
            "正文": body,
            "白话翻译": "",
            "设计方法通俗说明": "",
            "底层哲学通俗说": "",
            "底层心理学通俗说": "",
            "当时社会文化通俗说": "",
            "页面地址": page_url,
        }

    rows: list[dict[str, str]] = []
    for juan, pian, url in toc:
        intro, sections = chapter_intro_and_sections(fetch(url))
        # Chapter row first, then its subsections in page order.
        rows.append(make_row(juan, pian, intro, url))
        rows.extend(
            make_row(pian, sub_title, sub_body, url)
            for sub_title, sub_body in sections
        )

    write_table(rows, "园冶_篇目与分节")

    print("rows", len(rows))


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
