# -*- coding: utf-8 -*-
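"""Split an EPUB: copy selected chapters from a source book into a new one.

Chapters are selected via a url list file (see ``read_urls`` below): each
non-comment line holds a ``title -@- href`` pair, where ``href`` is a
document href taken from the source book's table of contents.

Example invocation (script and file names are illustrative):

    python split_epub.py -f source.epub -t subset.epub -u urls.txt --title "My Subset"
"""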
import argparse
import urllib.parse

import ebooklib
from ebooklib import epub
from loguru import logger


def parse_args():
    parser = argparse.ArgumentParser(description="split an EPUB by a url list")
    parser.add_argument(
        "--title",
        help="title of the new book (defaults to the first entry in the url list)",
        type=str,
        default=None,
    )
    parser.add_argument(
        "-f",
        help="source EPUB file to read from",
        required=True,
        type=str,
    )
    parser.add_argument(
        "-t",
        help="destination EPUB file to write to",
        required=True,
        type=str,
    )
    parser.add_argument(
        "-u",
        help="url list file (one 'title -@- href' pair per line)",
        required=True,
        type=str,
    )
    args = parser.parse_args()
    return args


def get_hrefs(e, hrefs, filter_set):
    """Recursively collect toc hrefs, keeping only those present in filter_set."""
    if isinstance(e, (list, tuple)):
        for d in e:
            get_hrefs(d, hrefs, filter_set)
        return
    # Sections and Links both carry an href attribute.
    if isinstance(e, (epub.Section, epub.Link)):
        url = urllib.parse.unquote(e.href)
        if url not in filter_set:
            return
        logger.debug(f"url: {url}")
        hrefs.append(url)
        return
    logger.error("unknown toc entry type: {}".format(type(e)))
    raise TypeError("unknown toc entry type: {}".format(type(e)))


def get_html(book, html_filter=None):
    """Return the book's HTML documents, optionally limited to file names in html_filter."""
    htmls = list()
    for d in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        if html_filter is None:
            htmls.append(d)
            continue
        file_name = d.file_name
        logger.debug(f"file_name: {file_name}")
        if file_name in html_filter:
            htmls.append(d)
    return htmls


def read_urls(args):
    """Parse the url list file: one 'title -@- href' pair per non-comment line."""
    urls = list()
    with open(args.u, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            blocks = line.split("-@-", 1)
            if len(blocks) != 2:
                logger.warning("skipping malformed line: {}".format(line))
                continue
            title, url = blocks[0].strip(), blocks[1].strip()
            urls.append((title, url))
    return urls
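

# A url list file might look like this (the hrefs below are illustrative;
# they must match hrefs in the source book's table of contents):
#
#   # lines starting with '#' are ignored
#   Chapter 1 -@- text/chap01.xhtml
#   Chapter 2 -@- text/chap02.xhtml#section-1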


def split_ebook(args):
    title = args.title
    urls = read_urls(args)
    if not title:
        # default to the title of the first entry
        title = urls[0][0]
    url_set = set(u[1] for u in urls)
    logger.info("urls count: {}".format(len(url_set)))
    srcbook = epub.read_epub(args.f)
    newbook = epub.EpubBook()
    # set metadata
    newbook.set_identifier("CR12E0-b7da687a3f9947aba3a73c20577be3d3")
    newbook.set_title(title)
    newbook.set_language("zh")
    newbook.add_author("罗杰")
    # basic spine
    newbook.spine = ["nav"]
    hrefs = list()
    get_hrefs(srcbook.toc, hrefs, filter_set=url_set)
    logger.info("hrefs count: {}".format(len(hrefs)))
    # drop '#fragment' parts so hrefs match document file names
    hrefs = set(h.strip().split("#")[0] for h in hrefs)
    htmls = get_html(srcbook, html_filter=hrefs)
    logger.info("htmls count: {}".format(len(htmls)))
    html_dict = {h.file_name: h for h in htmls}
    # copy documents in the order given by the url list
    sorted_htmls = list()
    for entry_title, url in urls:
        file_name = url.strip().split("#")[0]
        h = html_dict.get(file_name)
        if h is None:
            logger.warning("no such html for {}".format(file_name))
            continue
        sorted_htmls.append(h)
        h.title = entry_title
        logger.debug("title: {}, file_name: {}".format(h.title, h.file_name))
        newbook.add_item(h)
        newbook.spine.append(h)
    # define the table of contents (file_name doubles as a unique link uid)
    newbook.toc = [epub.Link(h.file_name, h.title, h.file_name) for h in sorted_htmls]
    # add default NCX and Nav files
    newbook.add_item(epub.EpubNcx())
    newbook.add_item(epub.EpubNav())
    # define CSS style
    style = "BODY {color: white;} code {color: #C0C0C0}"
    nav_css = epub.EpubItem(
        uid="style_nav",
        file_name="style/nav.css",
        media_type="text/css",
        content=style,
    )
    # add CSS file
    newbook.add_item(nav_css)
    # write the new book
    epub.write_epub(args.t, newbook, {})


def main(args):
    split_ebook(args)


if __name__ == "__main__":
    args = parse_args()
    main(args)