Splitting an EPUB ebook with ebooklib

Created: 2024-07-31 17:40 | Author: 风波 | Views: 14 | Category: Python
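
The script below reads a source EPUB with ebooklib, keeps only the chapters named in a url-list file, and writes them to a new EPUB in the listed order. Each useful line of the url list is a chapter title and that chapter's href inside the source book, joined by "-@-"; blank lines and lines starting with "#" are skipped. An illustrative list (the titles and hrefs here are placeholders, not taken from any real book):

    # comments are ignored
    前言-@-text/part0001.xhtml
    第一章-@-text/part0002.xhtml#ch01
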
# -*- coding: utf-8 -*-

import argparse
import urllib.parse

import ebooklib
from ebooklib import epub
from loguru import logger


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
            "--title",
            help="title for the new book (defaults to the first title in the url list)",
            type=str,
            default=None
            )
    parser.add_argument(
            "-f",
            help="source epub file",
            required=True,
            type=str,
            )
    parser.add_argument(
            "-t",
            help="output epub file",
            required=True,
            type=str,
            )
    parser.add_argument(
            "-u",
            help="file with the url list",
            required=True,
            type=str,
            )
    args = parser.parse_args()
    return args



def get_hrefs(e, hrefs, filter_set):
    # Walk the TOC tree and collect hrefs that appear in filter_set.
    if isinstance(e, (list, tuple)):
        for d in e:
            get_hrefs(d, hrefs, filter_set)
        return
    if isinstance(e, (epub.Section, epub.Link)):
        url = urllib.parse.unquote(e.href)
        if url not in filter_set:
            return
        logger.debug(f"url: {url}")
        hrefs.append(url)
        return
    logger.error(f"unknown type: {type(e)}")
    raise TypeError(f"unknown type: {type(e)}")
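
# Note: ebooklib models the TOC as a nested structure that mixes epub.Link
# objects with (epub.Section, [children]) tuples, e.g.
#   [Link(...), (Section("Part 1"), [Link(...), Link(...)])]
# which is why get_hrefs() recurses into lists and tuples before matching.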


def get_html(book, html_filter=None):
    # Collect the XHTML documents, optionally keeping only those in html_filter.
    htmls = list()
    for d in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        if html_filter is None:
            htmls.append(d)
            continue
        file_name = d.file_name
        logger.debug(f"file_name: {file_name}")
        if file_name in html_filter:
            htmls.append(d)
    return htmls
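
# Note: ITEM_DOCUMENT items are the book's XHTML content documents; each
# d.file_name is the path inside the epub archive (e.g. "text/part0001.xhtml",
# illustrative), which is what the hrefs from the url list must match.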


def read_urls(args):
    # Each useful line is "title-@-href"; blank lines and "#" comments are skipped.
    urls = list()
    with open(args.u, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            blocks = line.split("-@-")
            title, url = blocks[0], blocks[1]
            urls.append((title, url))
    return urls
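
# read_urls() returns ordered (title, href) pairs, e.g.
#   [("前言", "text/part0001.xhtml"), ("第一章", "text/part0002.xhtml#ch01")]
# (illustrative values); this order drives the new book's spine and TOC.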


def split_ebook(args):
    title = args.title
    urls = read_urls(args)
    if not title:
        # fall back to the title of the first listed chapter
        title = urls[0][0]
    url_set = {u[1] for u in urls}
    logger.info(f"urls count: {len(url_set)}")

    srcbook = epub.read_epub(args.f)
    newbook = epub.EpubBook()
    # set metadata (identifier and author are hardcoded here)
    newbook.set_identifier("CR12E0-b7da687a3f9947aba3a73c20577be3d3")
    newbook.set_title(title)
    newbook.set_language("zh")
    newbook.add_author("罗杰")

    # basic spine
    newbook.spine = ["nav"]
    hrefs = list()
    get_hrefs(srcbook.toc, hrefs, filter_set=url_set)
    logger.info(f"hrefs count: {len(hrefs)}")
    # drop "#fragment" anchors so the hrefs match document file names
    hrefs = {h.strip().split("#")[0] for h in hrefs}
    htmls = get_html(srcbook, html_filter=hrefs)
    logger.info(f"htmls count: {len(htmls)}")

    html_dict = dict()
    for h in htmls:
        html_dict[h.file_name] = h

    sorted_htmls = list()

    for url in urls:
        file_name = url[1].strip().split("#")[0]
        h = html_dict.get(file_name, None)
        if not h:
            logger.warning(f"no such html for {file_name}")
            continue
        sorted_htmls.append(h)
        h.title = url[0]
        logger.debug(f"title: {h.title}, file_name: {h.file_name}")
        newbook.add_item(h)
        newbook.spine.append(h)

    # define the table of contents in url-list order
    newbook.toc = [epub.Link(h.file_name, h.title, "") for h in sorted_htmls]

    # add default NCX and Nav file
    newbook.add_item(epub.EpubNcx())
    newbook.add_item(epub.EpubNav())

    # define CSS style
    style = "BODY {color: white;} code {color: #C0C0C0}"
    nav_css = epub.EpubItem(
        uid="style_nav",
        file_name="style/nav.css",
        media_type="text/css",
        content=style,
    )

    # add CSS file
    newbook.add_item(nav_css)
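
    # Note: add_item() on the book only bundles the stylesheet; for a chapter
    # to actually use it, the document would also need to link it, e.g. with
    # h.add_item(nav_css) when the documents are re-added above (not done here).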

    # write to the file
    epub.write_epub(args.t, newbook, {})


def main(args):
    split_ebook(args)


if "__main__" == __name__:
    args = parse_args()
    main(args)
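
A typical invocation (the script and file names here are placeholders):

    python split_epub.py -f full_book.epub -u urls.txt -t part1.epub --title "新书名"

--title is optional; when it is omitted, the title of the first chapter in the url list is used instead.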