~cytrogen/kobo-manga

ref: 4e504823f4bf8d2b5f4279da3f4d4ebe98fc97ad kobo-manga/src/kobo_manga/downloader/basic.py -rw-r--r-- 3.1 KiB
4e504823 — HallowDem Initial commit: kobo-manga pipeline a day ago
                                                                                
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""基础图片下载器"""

import asyncio
from pathlib import Path
from urllib.parse import urlsplit

import httpx

from kobo_manga.config import AppConfig
from kobo_manga.models import PageImage

# Browser-like User-Agent string, assembled from its three product tokens.
_USER_AGENT = " ".join(
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/120.0.0.0 Safari/537.36",
    )
)

# Default headers installed on the HTTP client that downloads chapter images.
HEADERS = {"User-Agent": _USER_AGENT}


async def download_image(
    client: httpx.AsyncClient,
    image: PageImage,
    output_dir: Path,
    referer: str,
    max_retry: int = 3,
) -> PageImage:
    """Download a single page image and record where it was saved.

    The target file is ``output_dir/<page_number zero-padded to 3><ext>``,
    where ``ext`` comes from the URL path and defaults to ``.jpg``. If that
    file already exists, no request is made.

    Args:
        client: HTTP client used to issue the GET request.
        image: Page to download; ``url`` and ``page_number`` are read and
            ``local_path`` is set once the file is in place.
        output_dir: Directory the image file is written into.
        referer: Value sent as the ``Referer`` request header.
        max_retry: Total number of attempts. Values below 1 are treated as 1
            so the download is always attempted at least once.

    Returns:
        The same ``image`` object, with ``local_path`` set to the file path.

    Raises:
        RuntimeError: If the last attempt fails with an HTTP or filesystem
            error; the underlying exception is chained as the cause.
    """
    # Look at the URL path only, so a query string, a fragment, or a bare
    # host name such as "example.com" is never mistaken for an extension.
    ext = Path(urlsplit(image.url).path).suffix or ".jpg"
    filepath = output_dir / f"{image.page_number:03d}{ext}"

    if filepath.exists():
        image.local_path = str(filepath)
        return image

    # Write to a sibling temp file and rename it into place afterwards: an
    # interrupted write must not leave a truncated image that the exists()
    # shortcut above would later accept as a finished download.
    partial = filepath.with_name(filepath.name + ".part")
    attempts = max(1, max_retry)

    for attempt in range(attempts):
        try:
            resp = await client.get(
                image.url,
                headers={"Referer": referer},
            )
            resp.raise_for_status()

            filepath.parent.mkdir(parents=True, exist_ok=True)
            partial.write_bytes(resp.content)
            partial.replace(filepath)
        except (httpx.HTTPError, OSError) as e:
            try:
                partial.unlink(missing_ok=True)
            except OSError:
                pass  # best-effort cleanup; the original error matters more
            if attempt == attempts - 1:
                raise RuntimeError(
                    f"下载失败 (第{image.page_number}页): {e}"
                ) from e
            # Linear back-off: 1s, 2s, 3s, ... between attempts.
            await asyncio.sleep(1.0 * (attempt + 1))
        else:
            image.local_path = str(filepath)
            return image

    # Not reachable (every iteration returns, raises, or retries); kept so
    # the function has an explicit return on every syntactic path.
    return image


async def download_chapter(
    images: list[PageImage],
    output_dir: Path,
    referer: str,
    config: AppConfig | None = None,
) -> list[PageImage]:
    """Download all images of one chapter with bounded concurrency.

    Each image is fetched through ``download_image`` using one shared HTTP
    client. A semaphore caps how many downloads run at once, and every task
    pauses for ``delay`` seconds before releasing its slot.

    Args:
        images: Pages to download (each carrying its URL).
        output_dir: Directory the image files are written into; created if
            it does not exist.
        referer: Value sent as the ``Referer`` request header.
        config: Optional application config. When given, its
            ``download.concurrent``, ``download.retry`` and ``download.delay``
            replace the defaults (3, 3 and 1.0).

    Returns:
        The images that were downloaded successfully, in input order. Images
        that failed are reported on stdout and left out of the result; their
        errors are not raised.
    """
    concurrent = 3
    max_retry = 3
    delay = 1.0

    if config is not None:
        concurrent = config.download.concurrent
        max_retry = config.download.retry
        delay = config.download.delay

    output_dir.mkdir(parents=True, exist_ok=True)

    # Nothing to fetch: skip building an HTTP client altogether.
    if not images:
        return []

    # A semaphore of 0 would block every task forever and a negative value
    # raises ValueError, so always allow at least one download at a time.
    semaphore = asyncio.Semaphore(max(1, concurrent))

    async with httpx.AsyncClient(
        headers=HEADERS,
        follow_redirects=True,
        timeout=60.0,
    ) as client:

        async def _download_with_limit(img: PageImage) -> PageImage:
            async with semaphore:
                result = await download_image(
                    client, img, output_dir, referer, max_retry
                )
                # Sleep while still holding the slot so the delay throttles
                # the overall request rate.
                await asyncio.sleep(delay)
                return result

        tasks = [_download_with_limit(img) for img in images]
        results = await asyncio.gather(*tasks, return_exceptions=True)

    downloaded = []
    errors = []
    for r in results:
        # gather() may hand back BaseException instances (for example
        # asyncio.CancelledError), which must not be mistaken for images.
        if isinstance(r, BaseException):
            errors.append(r)
        else:
            downloaded.append(r)

    if errors:
        print(f"  ⚠ {len(errors)} 张图片下载失败:")
        for e in errors:
            print(f"    - {e}")

    return downloaded