Implement Royal Road Source

This commit is contained in:
Joakim Holm 2023-04-27 23:04:14 +02:00
parent ce61b70d00
commit 1c3f1c4245
9 changed files with 171 additions and 18 deletions

View File

@@ -46,9 +46,21 @@ class ImageList:
"""
images: list[OnlineFile]
@dataclass(slots=True)
class HtmlFile:
    """A single html document that is downloaded as part of a book"""
    # Section title; used as the chapter title when converting to epub
    title: str
    # Remote html document to fetch
    file: OnlineFile
    # Attribute filter handed to BeautifulSoup's find() to pick out the
    # content element of the downloaded page
    selector: Optional[dict[str, str]]
@dataclass(slots=True)
class HtmlFiles:
    """Book data made up of a cover image and an ordered list of html files"""
    # Cover image for the book
    cover: OnlineFile
    # Html documents, in reading order
    htmlfiles: list[HtmlFile]
BookData = Union[
SingleFile,
ImageList
ImageList,
HtmlFiles
]
@dataclass(slots=True)
@@ -56,6 +68,7 @@ class Book:
"""Stores information about a book"""
metadata: Metadata
data: BookData
overwrite: bool = False
T = TypeVar("T")

View File

@@ -1,10 +1,10 @@
from grawlix.book import Book, BookData, SingleFile, ImageList, OnlineFile
from grawlix.book import Book, BookData, SingleFile, ImageList, OnlineFile, HtmlFiles
from grawlix.exceptions import GrawlixError
from grawlix.logging import info
from .output_format import OutputFormat
from .epub import Epub
from .cbz import Cbz
from .epub import Epub
from typing import Callable
from pathlib import Path
@@ -18,16 +18,18 @@ async def download_book(book: Book, update_func: Callable, template: str) -> Non
"""
output_format = get_default_format(book.data)
location = format_output_location(book, output_format, template)
if os.path.exists(location):
if not book.overwrite and os.path.exists(location):
info("Skipping - File already exists")
return
parent = Path(location).parent
if not parent.exists():
os.makedirs(parent)
if isinstance(book.data, SingleFile):
await output_format.dl_single_file(book.data, location, update_func)
await output_format.dl_single_file(book, location, update_func)
elif isinstance(book.data, ImageList):
await output_format.dl_image_list(book.data, location, update_func)
await output_format.dl_image_list(book, location, update_func)
elif isinstance(book.data, HtmlFiles):
await output_format.dl_html_files(book, location, update_func)
else:
raise NotImplementedError
await output_format.close()
@@ -58,6 +60,8 @@ def get_default_format(bookdata: BookData) -> OutputFormat:
return output_format_from_str(bookdata.file.extension)
if isinstance(bookdata, ImageList):
return Cbz()
if isinstance(bookdata, HtmlFiles):
return Epub()
raise GrawlixError

View File

@@ -1,5 +1,6 @@
from .output_format import OutputFormat, Update
from .output_format import OutputFormat, Update, Book
from grawlix.book import ImageList, OnlineFile
from grawlix.exceptions import UnsupportedOutputFormat
from zipfile import ZipFile
import asyncio
@@ -9,8 +10,11 @@ class Cbz(OutputFormat):
extension: str = "cbz"
async def dl_image_list(self, book: ImageList, location: str, update: Update) -> None:
image_count = len(book.images)
async def dl_image_list(self, book: Book, location: str, update: Update) -> None:
if not isinstance(book.data, ImageList):
raise UnsupportedOutputFormat
images = book.data.images
image_count = len(images)
with ZipFile(location, mode="w") as zip:
async def download_page(index: int, file: OnlineFile):
content = await self._download_file(file)
@@ -19,6 +23,6 @@ class Cbz(OutputFormat):
update(1/image_count)
tasks = [
asyncio.create_task(download_page(index, file))
for index, file in enumerate(book.images)
for index, file in enumerate(images)
]
await asyncio.wait(tasks)

View File

@@ -1,5 +1,63 @@
from grawlix.book import Book, SingleFile
from .output_format import OutputFormat
from grawlix.book import HtmlFiles, HtmlFile, OnlineFile, Book
from grawlix.exceptions import UnsupportedOutputFormat
from .output_format import OutputFormat, Update
import asyncio
from bs4 import BeautifulSoup
import os
from ebooklib import epub
class Epub(OutputFormat):
    """Writes books to disk as epub files"""

    extension = "epub"

    async def dl_html_files(self, book: Book, location: str, update: Update) -> None:
        """
        Download a `grawlix.HtmlFiles` book and write it to disk as an epub

        :param book: Book to download; `book.data` must be a `HtmlFiles`
        :param location: Path to where the epub file is written
        :param update: Progress callback, called with the fraction of files completed
        :raises UnsupportedOutputFormat: If `book.data` is not a `HtmlFiles`
        """
        if not isinstance(book.data, HtmlFiles):
            raise UnsupportedOutputFormat
        html = book.data
        output = epub.EpubBook()
        output.set_title(book.metadata.title)
        for author in book.metadata.authors:
            output.add_author(author)
        file_count = len(html.htmlfiles) + 1 # Html files + cover

        async def download_cover(cover_file: OnlineFile):
            """Download cover image, add it to the book and return the generated cover page"""
            # Fix: use the `cover_file` argument; the original ignored it and
            # reached for `html.cover` through the closure
            cover_filename = f"cover.{cover_file.extension}"
            epub_cover = epub.EpubCover(file_name = cover_filename)
            epub_cover.content = await self._download_file(cover_file)
            output.add_item(epub_cover)
            epub_cover_page = epub.EpubCoverHtml(image_name = cover_filename)
            if update:
                update(1/file_count)
            return epub_cover_page

        async def download_file(index: int, file: HtmlFile):
            """Download a single html file and convert it to an epub chapter"""
            response = await self._client.get(file.file.url, follow_redirects=True)
            soup = BeautifulSoup(response.text, "lxml")
            # `HtmlFile.selector` is Optional; keep the whole document when no
            # selector is given instead of passing attrs=None to find()
            selected_element = soup.find(attrs=file.selector) if file.selector else soup
            epub_file = epub.EpubHtml(
                title = file.title,
                # Fix: OCF forbids spaces in container file names ("part 0.html")
                file_name = f"part_{index}.html",
                content = str(selected_element)
            )
            if update:
                update(1/file_count)
            return epub_file

        # Download cover and chapters concurrently; gather preserves argument
        # order, so the cover page always ends up first in the spine
        tasks = [
            download_file(index, file)
            for index, file in enumerate(html.htmlfiles)
        ]
        epub_files = await asyncio.gather(download_cover(html.cover), *tasks)
        # Add files to epub
        for epub_file in epub_files:
            output.add_item(epub_file)
            output.spine.append(epub_file)
            output.toc.append(epub_file)
        # Complete book
        output.add_item(epub.EpubNcx())
        output.add_item(epub.EpubNav())
        epub.write_epub(location, output)

View File

@@ -1,4 +1,4 @@
from grawlix.book import Book, SingleFile, OnlineFile, ImageList
from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book
from grawlix.exceptions import UnsupportedOutputFormat
from grawlix.encryption import decrypt
@@ -20,7 +20,7 @@ class OutputFormat:
await self._client.aclose()
async def dl_single_file(self, book: SingleFile, location: str, update_func: Update) -> None:
async def dl_single_file(self, book: Book, location: str, update_func: Update) -> None:
"""
Download and write an `grawlix.SingleFile` to disk
@@ -28,12 +28,14 @@ class OutputFormat:
:param location: Path to where the file is written
:raises UnsupportedOutputFormat: If datatype is not supported by format
"""
if not book.file.extension == self.extension:
if not isinstance(book.data, SingleFile):
raise UnsupportedOutputFormat
await self._download_and_write_file(book.file, location, update_func)
if not book.data.file.extension == self.extension:
raise UnsupportedOutputFormat
await self._download_and_write_file(book.data.file, location, update_func)
async def dl_image_list(self, book: ImageList, location: str, update_func: Update) -> None:
async def dl_image_list(self, book: Book, location: str, update_func: Update) -> None:
"""
Download and write an `grawlix.ImageList` to disk
@@ -44,6 +46,17 @@ class OutputFormat:
raise UnsupportedOutputFormat
async def dl_html_files(self, book: Book, location: str, update_func: Update) -> None:
    """
    Download and write a `grawlix.HtmlFiles` to disk.

    The base implementation always rejects the request; output formats
    that can represent html-based books override this method.

    :param book: Book to download
    :param location: Path to where the file is written
    :raises UnsupportedOutputFormat: If datatype is not supported by format
    """
    raise UnsupportedOutputFormat
async def _download_file(self, file: OnlineFile, update: Update = None) -> bytes:
"""
Download `grawlix.OnlineFile`
@@ -53,7 +66,7 @@
:returns: Content of downloaded file
"""
content = b""
async with self._client.stream("GET", file.url, headers = file.headers) as request:
async with self._client.stream("GET", file.url, headers = file.headers, follow_redirects=True) as request:
total_filesize = int(request.headers["Content-length"])
async for chunk in request.aiter_bytes():
content += chunk

View File

@@ -4,6 +4,7 @@ from .source import Source
from .ereolen import Ereolen
from .flipp import Flipp
from .mangaplus import MangaPlus
from .royal_road import RoyalRoad
from .saxo import Saxo
from .webtoons import Webtoons
@@ -53,6 +54,7 @@ def get_source_classes() -> list[type[Source]]:
Ereolen,
Flipp,
MangaPlus,
RoyalRoad,
Saxo,
Webtoons
]

View File

@@ -0,0 +1,45 @@
from .source import Source
from grawlix.book import Book, HtmlFile, HtmlFiles, OnlineFile, Metadata
from bs4 import BeautifulSoup
class RoyalRoad(Source):
    """Source for fictions hosted on royalroad.com"""

    name: str = "Royal Road"
    match = [
        # Fix: escape the dots so the pattern only matches the real host
        # (an unescaped "." would also match e.g. "wwwXroyalroadYcom")
        r"https://www\.royalroad\.com/fiction/\d+/[^/]+"
    ]
    _authentication_methods: list[str] = []

    async def download(self, url: str) -> Book:
        """
        Download metadata and chapter list for a fiction

        :param url: Url of the fiction's overview page
        :returns: Book with cover image and one html file per chapter
        """
        response = await self._client.get(url)
        soup = BeautifulSoup(response.text, "lxml")
        chapters = []
        for chapter in soup.find_all("tr", class_="chapter-row"):
            chapters.append(
                HtmlFile(
                    title = chapter.find("a").text.strip(),
                    file = OnlineFile(
                        # Fix: use the www host directly so each chapter
                        # download doesn't depend on following a redirect
                        url = f"https://www.royalroad.com{chapter.get('data-url')}",
                        extension = "html"
                    ),
                    selector = { "class": "chapter-content" }
                )
            )
        return Book(
            data = HtmlFiles(
                cover = OnlineFile(
                    # Swap the "full" cover variant for "large" in the image url
                    url = soup.find("div", class_="cover-art-container") \
                        .find("img") \
                        .get("src") \
                        .replace("full", "large"),
                    extension = "jpg"
                ),
                htmlfiles = chapters
            ),
            metadata = Metadata(
                title = soup.find("meta", attrs={"name":"twitter:title"}).get("content"),
                authors = [ soup.find("meta", attrs={"name":"twitter:creator"}).get("content") ]
            ),
            # NOTE(review): presumably set so ongoing serials are re-downloaded
            # when new chapters appear — confirm intent
            overwrite = True
        )

View File

@@ -14,6 +14,7 @@ dependencies = [
"appdirs",
"beautifulsoup4",
"blackboxprotobuf",
"EbookLib",
"httpx",
"importlib-resources",
"lxml",

View File

@@ -20,6 +20,18 @@ let
doCheck = false;
};
ebooklib = python3Packages.buildPythonPackage rec {
pname = "EbookLib";
version = "0.18";
src = python3Packages.fetchPypi {
inherit pname version;
sha256 = "sha256-OFYmQ6e8lNm/VumTC0kn5Ok7XR0JF/aXpkVNtaHBpTM=";
};
propagatedBuildInputs = with python3Packages; [
six
lxml
];
};
in
mkShell {
buildInputs = [
@@ -27,6 +39,7 @@ mkShell {
appdirs
beautifulsoup4
blackboxprotobuf
ebooklib
httpx
importlib-resources
lxml