From 1c3f1c4245e453838ecdbc48a69caa42651af521 Mon Sep 17 00:00:00 2001
From: Joakim Holm
Date: Thu, 27 Apr 2023 23:04:14 +0200
Subject: [PATCH] Implement Royal Road Source

---
NOTE(review): The line breaks and leading whitespace of this patch were
reconstructed from the hunk line counts and the structure of the code; the
exact indentation of context lines should be checked against the target tree
(or apply with `git apply --ignore-whitespace`).

Summary of the change, as shown by the hunks below:
* book.py gains the HtmlFile / HtmlFiles dataclasses, adds HtmlFiles to the
  BookData union and adds Book.overwrite (default False).
* download_book now passes the whole Book to the dl_* methods, dispatches
  HtmlFiles to dl_html_files and skips existing files only when
  book.overwrite is false; get_default_format maps HtmlFiles to Epub.
* Epub.dl_html_files downloads the cover and every chapter page, keeps the
  element matching each file's selector and writes the book with ebooklib.
* RoyalRoad is a new source that scrapes chapter rows, cover and metadata
  from a fiction page and always sets overwrite = True.

Points to follow up in a later commit:
* output_format.py imports `Book` twice on the same line.
* epub.py imports `os` but the added code never references it.
* Epub.download_cover takes `cover_file` but reads `html.cover` instead.
* Epub.download_file passes the result of soup.find(...) to str() without
  checking for None, so a chapter whose selector matches nothing would get
  the literal text "None" as its content.

 grawlix/book.py                 | 15 +++++++-
 grawlix/output/__init__.py      | 14 +++++---
 grawlix/output/cbz.py           | 12 ++++---
 grawlix/output/epub.py          | 62 +++++++++++++++++++++++++++++++--
 grawlix/output/output_format.py | 25 +++++++++----
 grawlix/sources/__init__.py     |  2 ++
 grawlix/sources/royal_road.py   | 45 ++++++++++++++++++++++++
 pyproject.toml                  |  1 +
 shell.nix                       | 13 +++++++
 9 files changed, 171 insertions(+), 18 deletions(-)
 create mode 100644 grawlix/sources/royal_road.py

diff --git a/grawlix/book.py b/grawlix/book.py
index d977823..c1ec9d8 100644
--- a/grawlix/book.py
+++ b/grawlix/book.py
@@ -46,9 +46,21 @@ class ImageList:
     """
     images: list[OnlineFile]
 
+@dataclass(slots=True)
+class HtmlFile:
+    title: str
+    file: OnlineFile
+    selector: Optional[dict[str, str]]
+
+@dataclass(slots=True)
+class HtmlFiles:
+    cover: OnlineFile
+    htmlfiles: list[HtmlFile]
+
 BookData = Union[
     SingleFile,
-    ImageList
+    ImageList,
+    HtmlFiles
 ]
 
 @dataclass(slots=True)
@@ -56,6 +68,7 @@ class Book:
     """Stores information about a book"""
     metadata: Metadata
     data: BookData
+    overwrite: bool = False
 
 
 T = TypeVar("T")
diff --git a/grawlix/output/__init__.py b/grawlix/output/__init__.py
index 26e8e27..33f5c2c 100644
--- a/grawlix/output/__init__.py
+++ b/grawlix/output/__init__.py
@@ -1,10 +1,10 @@
-from grawlix.book import Book, BookData, SingleFile, ImageList, OnlineFile
+from grawlix.book import Book, BookData, SingleFile, ImageList, OnlineFile, HtmlFiles
 from grawlix.exceptions import GrawlixError
 from grawlix.logging import info
 
 from .output_format import OutputFormat
-from .epub import Epub
 from .cbz import Cbz
+from .epub import Epub
 
 from typing import Callable
 from pathlib import Path
@@ -18,16 +18,18 @@ async def download_book(book: Book, update_func: Callable, template: str) -> Non
     """
     output_format = get_default_format(book.data)
     location = format_output_location(book, output_format, template)
-    if os.path.exists(location):
+    if not book.overwrite and os.path.exists(location):
         info("Skipping - File already exists")
         return
     parent = Path(location).parent
     if not parent.exists():
         os.makedirs(parent)
     if isinstance(book.data, SingleFile):
-        await output_format.dl_single_file(book.data, location, update_func)
+        await output_format.dl_single_file(book, location, update_func)
     elif isinstance(book.data, ImageList):
-        await output_format.dl_image_list(book.data, location, update_func)
+        await output_format.dl_image_list(book, location, update_func)
+    elif isinstance(book.data, HtmlFiles):
+        await output_format.dl_html_files(book, location, update_func)
     else:
         raise NotImplementedError
     await output_format.close()
@@ -58,6 +60,8 @@ def get_default_format(bookdata: BookData) -> OutputFormat:
         return output_format_from_str(bookdata.file.extension)
     if isinstance(bookdata, ImageList):
         return Cbz()
+    if isinstance(bookdata, HtmlFiles):
+        return Epub()
     raise GrawlixError
 
 
diff --git a/grawlix/output/cbz.py b/grawlix/output/cbz.py
index 4f7ac19..54ac63a 100644
--- a/grawlix/output/cbz.py
+++ b/grawlix/output/cbz.py
@@ -1,5 +1,6 @@
-from .output_format import OutputFormat, Update
+from .output_format import OutputFormat, Update, Book
 from grawlix.book import ImageList, OnlineFile
+from grawlix.exceptions import UnsupportedOutputFormat
 
 from zipfile import ZipFile
 import asyncio
@@ -9,8 +10,11 @@ class Cbz(OutputFormat):
 
     extension: str = "cbz"
 
-    async def dl_image_list(self, book: ImageList, location: str, update: Update) -> None:
-        image_count = len(book.images)
+    async def dl_image_list(self, book: Book, location: str, update: Update) -> None:
+        if not isinstance(book.data, ImageList):
+            raise UnsupportedOutputFormat
+        images = book.data.images
+        image_count = len(images)
         with ZipFile(location, mode="w") as zip:
             async def download_page(index: int, file: OnlineFile):
                 content = await self._download_file(file)
@@ -19,6 +23,6 @@ class Cbz(OutputFormat):
                     update(1/image_count)
             tasks = [
                 asyncio.create_task(download_page(index, file))
-                for index, file in enumerate(book.images)
+                for index, file in enumerate(images)
             ]
             await asyncio.wait(tasks)
diff --git a/grawlix/output/epub.py b/grawlix/output/epub.py
index ef634a0..39bc683 100644
--- a/grawlix/output/epub.py
+++ b/grawlix/output/epub.py
@@ -1,5 +1,63 @@
-from grawlix.book import Book, SingleFile
-from .output_format import OutputFormat
+from grawlix.book import HtmlFiles, HtmlFile, OnlineFile, Book
+from grawlix.exceptions import UnsupportedOutputFormat
+from .output_format import OutputFormat, Update
+
+import asyncio
+from bs4 import BeautifulSoup
+import os
+from ebooklib import epub
 
 class Epub(OutputFormat):
     extension = "epub"
+
+    async def dl_html_files(self, book: Book, location: str, update: Update) -> None:
+        if not isinstance(book.data, HtmlFiles):
+            raise UnsupportedOutputFormat
+        html = book.data
+        output = epub.EpubBook()
+        output.set_title(book.metadata.title)
+        for author in book.metadata.authors:
+            output.add_author(author)
+        file_count = len(html.htmlfiles) + 1 # Html files + cover
+
+        async def download_cover(cover_file: OnlineFile):
+            cover_filename = f"cover.{html.cover.extension}"
+            epub_cover = epub.EpubCover(file_name = cover_filename)
+            epub_cover.content = await self._download_file(html.cover)
+            output.add_item(epub_cover)
+            epub_cover_page = epub.EpubCoverHtml(image_name = cover_filename)
+            if update:
+                update(1/file_count)
+            return epub_cover_page
+
+
+        async def download_file(index: int, file: HtmlFile):
+            response = await self._client.get(file.file.url, follow_redirects=True)
+            soup = BeautifulSoup(response.text, "lxml")
+            selected_element = soup.find(attrs=file.selector)
+            epub_file = epub.EpubHtml(
+                title = file.title,
+                file_name = f"part {index}.html",
+                content = str(selected_element)
+            )
+            if update:
+                update(1/file_count)
+            return epub_file
+
+        # Download files
+        tasks = [
+            download_file(index, file)
+            for index, file in enumerate(html.htmlfiles)
+        ]
+        epub_files = await asyncio.gather(download_cover(html.cover), *tasks)
+
+        # Add files to epub
+        for epub_file in epub_files:
+            output.add_item(epub_file)
+            output.spine.append(epub_file)
+            output.toc.append(epub_file)
+
+        # Complete book
+        output.add_item(epub.EpubNcx())
+        output.add_item(epub.EpubNav())
+        epub.write_epub(location, output)
diff --git a/grawlix/output/output_format.py b/grawlix/output/output_format.py
index 59f57fe..83406e6 100644
--- a/grawlix/output/output_format.py
+++ b/grawlix/output/output_format.py
@@ -1,4 +1,4 @@
-from grawlix.book import Book, SingleFile, OnlineFile, ImageList
+from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book
 from grawlix.exceptions import UnsupportedOutputFormat
 from grawlix.encryption import decrypt
 
@@ -20,7 +20,7 @@ class OutputFormat:
         await self._client.aclose()
 
 
-    async def dl_single_file(self, book: SingleFile, location: str, update_func: Update) -> None:
+    async def dl_single_file(self, book: Book, location: str, update_func: Update) -> None:
         """
         Download and write an `grawlix.SingleFile` to disk
 
@@ -28,12 +28,14 @@ class OutputFormat:
         :param location: Path to where the file is written
         :raises UnsupportedOutputFormat: If datatype is not supported by format
         """
-        if not book.file.extension == self.extension:
+        if not isinstance(book.data, SingleFile):
             raise UnsupportedOutputFormat
-        await self._download_and_write_file(book.file, location, update_func)
+        if not book.data.file.extension == self.extension:
+            raise UnsupportedOutputFormat
+        await self._download_and_write_file(book.data.file, location, update_func)
 
 
-    async def dl_image_list(self, book: ImageList, location: str, update_func: Update) -> None:
+    async def dl_image_list(self, book: Book, location: str, update_func: Update) -> None:
         """
         Download and write an `grawlix.ImageList` to disk
 
@@ -44,6 +46,17 @@ class OutputFormat:
         raise UnsupportedOutputFormat
 
 
+    async def dl_html_files(self, book: Book, location: str, update_func: Update) -> None:
+        """
+        Download and write a `grawlix.HtmlFiles` to disk
+
+        :param book: Book to download
+        :param location: Path to where the file is written
+        :raises UnsupportedOutputFormat: If datatype is not supported by format
+        """
+        raise UnsupportedOutputFormat
+
+
     async def _download_file(self, file: OnlineFile, update: Update = None) -> bytes:
         """
         Download `grawlix.OnlineFile`
@@ -53,7 +66,7 @@ class OutputFormat:
         :returns: Content of downloaded file
         """
         content = b""
-        async with self._client.stream("GET", file.url, headers = file.headers) as request:
+        async with self._client.stream("GET", file.url, headers = file.headers, follow_redirects=True) as request:
             total_filesize = int(request.headers["Content-length"])
             async for chunk in request.aiter_bytes():
                 content += chunk
diff --git a/grawlix/sources/__init__.py b/grawlix/sources/__init__.py
index 7a42103..cd87be7 100644
--- a/grawlix/sources/__init__.py
+++ b/grawlix/sources/__init__.py
@@ -4,6 +4,7 @@ from .source import Source
 from .ereolen import Ereolen
 from .flipp import Flipp
 from .mangaplus import MangaPlus
+from .royal_road import RoyalRoad
 from .saxo import Saxo
 from .webtoons import Webtoons
 
@@ -53,6 +54,7 @@ def get_source_classes() -> list[type[Source]]:
         Ereolen,
         Flipp,
         MangaPlus,
+        RoyalRoad,
         Saxo,
         Webtoons
     ]
diff --git a/grawlix/sources/royal_road.py b/grawlix/sources/royal_road.py
new file mode 100644
index 0000000..3599b32
--- /dev/null
+++ b/grawlix/sources/royal_road.py
@@ -0,0 +1,45 @@
+from .source import Source
+from grawlix.book import Book, HtmlFile, HtmlFiles, OnlineFile, Metadata
+
+from bs4 import BeautifulSoup
+
+class RoyalRoad(Source):
+    name: str = "Royal Road"
+    match = [
+        r"https://www.royalroad.com/fiction/\d+/[^/]+"
+    ]
+    _authentication_methods: list[str] = []
+
+
+    async def download(self, url: str) -> Book:
+        response = await self._client.get(url)
+        soup = BeautifulSoup(response.text, "lxml")
+        chapters = []
+        for chapter in soup.find_all("tr", class_="chapter-row"):
+            chapters.append(
+                HtmlFile(
+                    title = chapter.find("a").text.strip(),
+                    file = OnlineFile(
+                        url = f"https://royalroad.com{chapter.get('data-url')}",
+                        extension = "html"
+                    ),
+                    selector = { "class": "chapter-content" }
+                )
+            )
+        return Book(
+            data = HtmlFiles(
+                cover = OnlineFile(
+                    url = soup.find("div", class_="cover-art-container") \
+                        .find("img") \
+                        .get("src") \
+                        .replace("full", "large"),
+                    extension = "jpg"
+                ),
+                htmlfiles = chapters
+            ),
+            metadata = Metadata(
+                title = soup.find("meta", attrs={"name":"twitter:title"}).get("content"),
+                authors = [ soup.find("meta", attrs={"name":"twitter:creator"}).get("content") ]
+            ),
+            overwrite = True
+        )
diff --git a/pyproject.toml b/pyproject.toml
index e172ddf..b731394 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -14,6 +14,7 @@ dependencies = [
     "appdirs",
     "beautifulsoup4",
     "blackboxprotobuf",
+    "EbookLib",
     "httpx",
     "importlib-resources",
     "lxml",
diff --git a/shell.nix b/shell.nix
index 1895ca4..83e4454 100644
--- a/shell.nix
+++ b/shell.nix
@@ -20,6 +20,18 @@ let
     doCheck = false;
   };
 
+  ebooklib = python3Packages.buildPythonPackage rec {
+    pname = "EbookLib";
+    version = "0.18";
+    src = python3Packages.fetchPypi {
+      inherit pname version;
+      sha256 = "sha256-OFYmQ6e8lNm/VumTC0kn5Ok7XR0JF/aXpkVNtaHBpTM=";
+    };
+    propagatedBuildInputs = with python3Packages; [
+      six
+      lxml
+    ];
+  };
 in
 mkShell {
   buildInputs = [
@@ -27,6 +39,7 @@ mkShell {
       appdirs
       beautifulsoup4
       blackboxprotobuf
+      ebooklib
       httpx
       importlib-resources
       lxml