Implement Royal Road Source

This commit is contained in:
Joakim Holm 2023-04-27 23:04:14 +02:00
parent ce61b70d00
commit 1c3f1c4245
9 changed files with 171 additions and 18 deletions

View File

@ -46,9 +46,21 @@ class ImageList:
""" """
images: list[OnlineFile] images: list[OnlineFile]
@dataclass(slots=True)
class HtmlFile:
    """A single html part (e.g. a chapter) of a book"""
    # Human-readable title of this part
    title: str
    # Remote location of the html document
    file: OnlineFile
    # Attribute filter passed to BeautifulSoup's `find(attrs=...)` to
    # extract the relevant element from the downloaded document
    selector: Optional[dict[str, str]]
@dataclass(slots=True)
class HtmlFiles:
    """Book content stored as a collection of remote html files"""
    # Cover image of the book
    cover: OnlineFile
    # The html parts making up the book, in reading order
    htmlfiles: list[HtmlFile]
BookData = Union[ BookData = Union[
SingleFile, SingleFile,
ImageList ImageList,
HtmlFiles
] ]
@dataclass(slots=True) @dataclass(slots=True)
@ -56,6 +68,7 @@ class Book:
"""Stores information about a book""" """Stores information about a book"""
metadata: Metadata metadata: Metadata
data: BookData data: BookData
overwrite: bool = False
T = TypeVar("T") T = TypeVar("T")

View File

@ -1,10 +1,10 @@
from grawlix.book import Book, BookData, SingleFile, ImageList, OnlineFile from grawlix.book import Book, BookData, SingleFile, ImageList, OnlineFile, HtmlFiles
from grawlix.exceptions import GrawlixError from grawlix.exceptions import GrawlixError
from grawlix.logging import info from grawlix.logging import info
from .output_format import OutputFormat from .output_format import OutputFormat
from .epub import Epub
from .cbz import Cbz from .cbz import Cbz
from .epub import Epub
from typing import Callable from typing import Callable
from pathlib import Path from pathlib import Path
@ -18,16 +18,18 @@ async def download_book(book: Book, update_func: Callable, template: str) -> Non
""" """
output_format = get_default_format(book.data) output_format = get_default_format(book.data)
location = format_output_location(book, output_format, template) location = format_output_location(book, output_format, template)
if os.path.exists(location): if not book.overwrite and os.path.exists(location):
info("Skipping - File already exists") info("Skipping - File already exists")
return return
parent = Path(location).parent parent = Path(location).parent
if not parent.exists(): if not parent.exists():
os.makedirs(parent) os.makedirs(parent)
if isinstance(book.data, SingleFile): if isinstance(book.data, SingleFile):
await output_format.dl_single_file(book.data, location, update_func) await output_format.dl_single_file(book, location, update_func)
elif isinstance(book.data, ImageList): elif isinstance(book.data, ImageList):
await output_format.dl_image_list(book.data, location, update_func) await output_format.dl_image_list(book, location, update_func)
elif isinstance(book.data, HtmlFiles):
await output_format.dl_html_files(book, location, update_func)
else: else:
raise NotImplementedError raise NotImplementedError
await output_format.close() await output_format.close()
@ -58,6 +60,8 @@ def get_default_format(bookdata: BookData) -> OutputFormat:
return output_format_from_str(bookdata.file.extension) return output_format_from_str(bookdata.file.extension)
if isinstance(bookdata, ImageList): if isinstance(bookdata, ImageList):
return Cbz() return Cbz()
if isinstance(bookdata, HtmlFiles):
return Epub()
raise GrawlixError raise GrawlixError

View File

@ -1,5 +1,6 @@
from .output_format import OutputFormat, Update from .output_format import OutputFormat, Update, Book
from grawlix.book import ImageList, OnlineFile from grawlix.book import ImageList, OnlineFile
from grawlix.exceptions import UnsupportedOutputFormat
from zipfile import ZipFile from zipfile import ZipFile
import asyncio import asyncio
@ -9,8 +10,11 @@ class Cbz(OutputFormat):
extension: str = "cbz" extension: str = "cbz"
async def dl_image_list(self, book: ImageList, location: str, update: Update) -> None: async def dl_image_list(self, book: Book, location: str, update: Update) -> None:
image_count = len(book.images) if not isinstance(book.data, ImageList):
raise UnsupportedOutputFormat
images = book.data.images
image_count = len(images)
with ZipFile(location, mode="w") as zip: with ZipFile(location, mode="w") as zip:
async def download_page(index: int, file: OnlineFile): async def download_page(index: int, file: OnlineFile):
content = await self._download_file(file) content = await self._download_file(file)
@ -19,6 +23,6 @@ class Cbz(OutputFormat):
update(1/image_count) update(1/image_count)
tasks = [ tasks = [
asyncio.create_task(download_page(index, file)) asyncio.create_task(download_page(index, file))
for index, file in enumerate(book.images) for index, file in enumerate(images)
] ]
await asyncio.wait(tasks) await asyncio.wait(tasks)

View File

@ -1,5 +1,63 @@
from grawlix.book import Book, SingleFile from grawlix.book import HtmlFiles, HtmlFile, OnlineFile, Book
from .output_format import OutputFormat from grawlix.exceptions import UnsupportedOutputFormat
from .output_format import OutputFormat, Update
import asyncio
from bs4 import BeautifulSoup
import os
from ebooklib import epub
class Epub(OutputFormat): class Epub(OutputFormat):
extension = "epub" extension = "epub"
async def dl_html_files(self, book: Book, location: str, update: Update) -> None:
    """
    Download a `grawlix.HtmlFiles` book and write it to disk as an epub

    :param book: Book to download; `book.data` must be a `HtmlFiles`
    :param location: Path the finished epub file is written to
    :param update: Progress callback, called with the fraction completed
    :raises UnsupportedOutputFormat: If `book.data` is not a `HtmlFiles`
    """
    if not isinstance(book.data, HtmlFiles):
        raise UnsupportedOutputFormat
    html = book.data
    output = epub.EpubBook()
    output.set_title(book.metadata.title)
    for author in book.metadata.authors:
        output.add_author(author)
    # Chapters plus the cover page; scales each progress update
    file_count = len(html.htmlfiles) + 1

    async def download_cover(cover_file: OnlineFile):
        # Fetch the cover image and register both the image item and the
        # generated cover page. Uses the `cover_file` argument (previously
        # it was ignored in favour of the closed-over `html.cover`).
        cover_filename = f"cover.{cover_file.extension}"
        epub_cover = epub.EpubCover(file_name = cover_filename)
        epub_cover.content = await self._download_file(cover_file)
        output.add_item(epub_cover)
        epub_cover_page = epub.EpubCoverHtml(image_name = cover_filename)
        if update:
            update(1/file_count)
        return epub_cover_page

    async def download_file(index: int, file: HtmlFile):
        # Download one chapter and keep only the element matched by the
        # file's attribute selector
        response = await self._client.get(file.file.url, follow_redirects=True)
        soup = BeautifulSoup(response.text, "lxml")
        selected_element = soup.find(attrs=file.selector)
        epub_file = epub.EpubHtml(
            title = file.title,
            # No spaces in the name: OCF item file names must be usable
            # as URI components, and some readers reject unescaped spaces
            file_name = f"part_{index}.html",
            content = str(selected_element)
        )
        if update:
            update(1/file_count)
        return epub_file

    # Download cover and chapters concurrently; gather preserves argument
    # order, so the cover page stays first in the spine
    tasks = [
        download_file(index, file)
        for index, file in enumerate(html.htmlfiles)
    ]
    epub_files = await asyncio.gather(download_cover(html.cover), *tasks)
    # Add files to epub
    for epub_file in epub_files:
        output.add_item(epub_file)
        output.spine.append(epub_file)
        output.toc.append(epub_file)
    # Complete book
    output.add_item(epub.EpubNcx())
    output.add_item(epub.EpubNav())
    epub.write_epub(location, output)

View File

@ -1,4 +1,4 @@
from grawlix.book import Book, SingleFile, OnlineFile, ImageList from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book
from grawlix.exceptions import UnsupportedOutputFormat from grawlix.exceptions import UnsupportedOutputFormat
from grawlix.encryption import decrypt from grawlix.encryption import decrypt
@ -20,7 +20,7 @@ class OutputFormat:
await self._client.aclose() await self._client.aclose()
async def dl_single_file(self, book: SingleFile, location: str, update_func: Update) -> None: async def dl_single_file(self, book: Book, location: str, update_func: Update) -> None:
""" """
Download and write an `grawlix.SingleFile` to disk Download and write an `grawlix.SingleFile` to disk
@ -28,12 +28,14 @@ class OutputFormat:
:param location: Path to where the file is written :param location: Path to where the file is written
:raises UnsupportedOutputFormat: If datatype is not supported by format :raises UnsupportedOutputFormat: If datatype is not supported by format
""" """
if not book.file.extension == self.extension: if not isinstance(book.data, SingleFile):
raise UnsupportedOutputFormat raise UnsupportedOutputFormat
await self._download_and_write_file(book.file, location, update_func) if not book.data.file.extension == self.extension:
raise UnsupportedOutputFormat
await self._download_and_write_file(book.data.file, location, update_func)
async def dl_image_list(self, book: ImageList, location: str, update_func: Update) -> None: async def dl_image_list(self, book: Book, location: str, update_func: Update) -> None:
""" """
Download and write an `grawlix.ImageList` to disk Download and write an `grawlix.ImageList` to disk
@ -44,6 +46,17 @@ class OutputFormat:
raise UnsupportedOutputFormat raise UnsupportedOutputFormat
async def dl_html_files(self, book: Book, location: str, update_func: Update) -> None:
"""
Download and write a `grawlix.HtmlFiles` to disk
:param book: Book to download
:param location: Path to where the file is written
:raises UnsupportedOutputFormat: If datatype is not supported by format
"""
raise UnsupportedOutputFormat
async def _download_file(self, file: OnlineFile, update: Update = None) -> bytes: async def _download_file(self, file: OnlineFile, update: Update = None) -> bytes:
""" """
Download `grawlix.OnlineFile` Download `grawlix.OnlineFile`
@ -53,7 +66,7 @@ class OutputFormat:
:returns: Content of downloaded file :returns: Content of downloaded file
""" """
content = b"" content = b""
async with self._client.stream("GET", file.url, headers = file.headers) as request: async with self._client.stream("GET", file.url, headers = file.headers, follow_redirects=True) as request:
total_filesize = int(request.headers["Content-length"]) total_filesize = int(request.headers["Content-length"])
async for chunk in request.aiter_bytes(): async for chunk in request.aiter_bytes():
content += chunk content += chunk

View File

@ -4,6 +4,7 @@ from .source import Source
from .ereolen import Ereolen from .ereolen import Ereolen
from .flipp import Flipp from .flipp import Flipp
from .mangaplus import MangaPlus from .mangaplus import MangaPlus
from .royal_road import RoyalRoad
from .saxo import Saxo from .saxo import Saxo
from .webtoons import Webtoons from .webtoons import Webtoons
@ -53,6 +54,7 @@ def get_source_classes() -> list[type[Source]]:
Ereolen, Ereolen,
Flipp, Flipp,
MangaPlus, MangaPlus,
RoyalRoad,
Saxo, Saxo,
Webtoons Webtoons
] ]

View File

@ -0,0 +1,45 @@
from .source import Source
from grawlix.book import Book, HtmlFile, HtmlFiles, OnlineFile, Metadata
from bs4 import BeautifulSoup
class RoyalRoad(Source):
    """Source for web fiction hosted on royalroad.com"""

    name: str = "Royal Road"
    match = [
        # Dots escaped so the pattern only matches literal "www.royalroad.com"
        r"https://www\.royalroad\.com/fiction/\d+/[^/]+"
    ]
    _authentication_methods: list[str] = []

    async def download(self, url: str) -> Book:
        """
        Scrape a fiction overview page and return it as a `HtmlFiles` book

        :param url: Url of a royalroad.com fiction overview page
        :returns: Book with one `HtmlFile` per chapter
        """
        response = await self._client.get(url)
        soup = BeautifulSoup(response.text, "lxml")
        chapters = []
        for chapter in soup.find_all("tr", class_="chapter-row"):
            chapters.append(
                HtmlFile(
                    title = chapter.find("a").text.strip(),
                    file = OnlineFile(
                        # Use the canonical www host (same as `match`) so
                        # chapter downloads don't go through a redirect
                        url = f"https://www.royalroad.com{chapter.get('data-url')}",
                        extension = "html"
                    ),
                    selector = { "class": "chapter-content" }
                )
            )
        return Book(
            data = HtmlFiles(
                cover = OnlineFile(
                    # "large" presumably is the biggest size actually
                    # served for cover art -- TODO confirm
                    url = soup.find("div", class_="cover-art-container") \
                        .find("img") \
                        .get("src") \
                        .replace("full", "large"),
                    extension = "jpg"
                ),
                htmlfiles = chapters
            ),
            metadata = Metadata(
                title = soup.find("meta", attrs={"name":"twitter:title"}).get("content"),
                authors = [ soup.find("meta", attrs={"name":"twitter:creator"}).get("content") ]
            ),
            # New chapters appear over time, so always re-download
            overwrite = True
        )

View File

@ -14,6 +14,7 @@ dependencies = [
"appdirs", "appdirs",
"beautifulsoup4", "beautifulsoup4",
"blackboxprotobuf", "blackboxprotobuf",
"EbookLib",
"httpx", "httpx",
"importlib-resources", "importlib-resources",
"lxml", "lxml",

View File

@ -20,6 +20,18 @@ let
doCheck = false; doCheck = false;
}; };
# EbookLib built from PyPI — presumably not available in the pinned
# nixpkgs snapshot; verify before upgrading the pin
ebooklib = python3Packages.buildPythonPackage rec {
  pname = "EbookLib";
  version = "0.18";
  src = python3Packages.fetchPypi {
    inherit pname version;
    sha256 = "sha256-OFYmQ6e8lNm/VumTC0kn5Ok7XR0JF/aXpkVNtaHBpTM=";
  };
  # Runtime dependencies of EbookLib
  propagatedBuildInputs = with python3Packages; [
    six
    lxml
  ];
};
in in
mkShell { mkShell {
buildInputs = [ buildInputs = [
@ -27,6 +39,7 @@ mkShell {
appdirs appdirs
beautifulsoup4 beautifulsoup4
blackboxprotobuf blackboxprotobuf
ebooklib
httpx httpx
importlib-resources importlib-resources
lxml lxml