@dataclass(slots=True)
class EpubInParts:
    """
    An epub book delivered as multiple smaller epub files.

    Some sources (e.g. Nextory) split a single book into one epub per
    chapter; the epub output format merges these fragments back into a
    single file.
    """
    # Downloadable epub fragments, in reading order.
    files: list[OnlineFile]
    # Maps a content-file name (possibly carrying a "#fragment" suffix)
    # to its display title in the table of contents.
    files_in_toc: dict[str, str]
book.data if isinstance(bookdata, SingleFile): extension = bookdata.file.extension - if isinstance(bookdata, ImageList): + elif isinstance(bookdata, ImageList): extension = "cbz" - if isinstance(bookdata, HtmlFiles): + elif isinstance(bookdata, HtmlFiles) or isinstance(bookdata, EpubInParts): extension = "epub" output_format = find_output_format(book, extension) return output_format() diff --git a/grawlix/output/epub.py b/grawlix/output/epub.py index 7a5d4fa..7850805 100644 --- a/grawlix/output/epub.py +++ b/grawlix/output/epub.py @@ -1,4 +1,4 @@ -from grawlix.book import HtmlFiles, HtmlFile, OnlineFile, Book, SingleFile, Metadata +from grawlix.book import HtmlFiles, HtmlFile, OnlineFile, Book, SingleFile, Metadata, EpubInParts from grawlix.exceptions import UnsupportedOutputFormat from .output_format import OutputFormat, Update @@ -6,19 +6,25 @@ import asyncio from bs4 import BeautifulSoup import os from ebooklib import epub +from zipfile import ZipFile +import rich class Epub(OutputFormat): extension = "epub" - input_types = [SingleFile, HtmlFiles] + input_types = [SingleFile, HtmlFiles, EpubInParts] + async def download(self, book: Book, location: str, update: Update) -> None: if isinstance(book.data, SingleFile): await self._download_single_file(book, location, update) elif isinstance(book.data, HtmlFiles): await self._download_html_files(book.data, book.metadata, location, update) + elif isinstance(book.data, EpubInParts): + await self._download_epub_in_parts(book.data, book.metadata, location, update) else: raise UnsupportedOutputFormat + async def _download_html_files(self, html: HtmlFiles, metadata: Metadata, location: str, update: Update) -> None: output = epub.EpubBook() output.set_title(metadata.title) @@ -67,3 +73,59 @@ class Epub(OutputFormat): output.add_item(epub.EpubNcx()) output.add_item(epub.EpubNav()) epub.write_epub(location, output) + + + async def _download_epub_in_parts(self, data: EpubInParts, metadata: Metadata, location: str, update: 
Update) -> None: + files = data.files + file_count = len(files) + progress = 1/(file_count) + temporary_file_location = f"{location}.tmp" + + added_files: set[str] = set() + def get_new_files(zipfile: ZipFile): + """Returns files in zipfile not already added to file""" + for filename in zipfile.namelist(): + if filename in added_files or filename.endswith(".opf") or filename.endswith(".ncx"): + continue + yield filename + + output = epub.EpubBook() + for file in files: + await self._download_and_write_file(file, temporary_file_location) + with ZipFile(temporary_file_location, "r") as zipfile: + for filepath in get_new_files(zipfile): + content = zipfile.read(filepath) + if filepath.endswith("html"): + filename = os.path.basename(filepath) + is_in_toc = False + title = None + for key, value in data.files_in_toc.items(): + toc_filename = key.split("#")[0] + if filename == toc_filename: + title = value + is_in_toc = True + break + epub_file = epub.EpubHtml( + title = title, + file_name = filepath, + content = content + ) + output.add_item(epub_file) + output.spine.append(epub_file) + if is_in_toc: + output.toc.append(epub_file) + else: + epub_file = epub.EpubItem( + file_name = filepath, + content = content + ) + output.add_item(epub_file) + added_files.add(filepath) + if update: + update(progress) + os.remove(temporary_file_location) + + output.add_item(epub.EpubNcx()) + output.add_item(epub.EpubNav()) + epub.write_epub(location, output) + exit() diff --git a/grawlix/sources/__init__.py b/grawlix/sources/__init__.py index 7e1fbf4..01ec768 100644 --- a/grawlix/sources/__init__.py +++ b/grawlix/sources/__init__.py @@ -5,6 +5,7 @@ from .ereolen import Ereolen from .flipp import Flipp from .internet_archive import InternetArchive from .mangaplus import MangaPlus +from .nextory import Nextory from .royal_road import RoyalRoad from .saxo import Saxo from .webtoons import Webtoons @@ -56,6 +57,7 @@ def get_source_classes() -> list[type[Source]]: Flipp, InternetArchive, 
MangaPlus, + Nextory, RoyalRoad, Saxo, Webtoons diff --git a/grawlix/sources/nextory.py b/grawlix/sources/nextory.py new file mode 100644 index 0000000..5356641 --- /dev/null +++ b/grawlix/sources/nextory.py @@ -0,0 +1,227 @@ +from grawlix.book import Book, Metadata, OnlineFile, BookData, OnlineFile, SingleFile, EpubInParts, Result, Series +from grawlix.encryption import AESEncryption +from grawlix.exceptions import InvalidUrl +from .source import Source + +from typing import Optional +import uuid +import rich +import base64 + +LOCALE = "en_GB" + +class Nextory(Source): + name: str = "Nextory" + match = [ + r"https?://((www|catalog-\w\w).)?nextory.+" + ] + _authentication_methods = [ "login" ] + + + @staticmethod + def _create_device_id() -> str: + """Create unique device id""" + return str(uuid.uuid3(uuid.NAMESPACE_DNS, "audiobook-dl")) + + + async def login(self, username: str, password: str, **kwargs) -> None: + # Set permanent headers + device_id = self._create_device_id() + self._client.headers.update( + { + "X-Application-Id": "200", + "X-App-Version": "5.0.0", + "X-Locale": LOCALE, + "X-Model": "Personal Computer", + "X-Device-Id": device_id, + "locale": LOCALE, + "device": device_id, + "osinfo": "Android 13", + "model": "Personal Computer", + "version": "4.34.6", + "appid": "200", + } + ) + # Login for account + session_response = await self._client.post( + "https://api.nextory.com/user/v1/sessions", + json = { + "identifier": username, + "password": password + }, + ) + session_response = session_response.json() + rich.print(session_response) + login_token = session_response["login_token"] + country = session_response["country"] + self._client.headers.update( + { + "token": login_token, + "X-Login-Token": login_token, + "X-Country-Code": country, + } + ) + # Login for user + profiles_response = await self._client.get( + "https://api.nextory.com/user/v1/me/profiles", + ) + profiles_response = profiles_response.json() + rich.print(profiles_response) + profile 
= profiles_response["profiles"][0] + login_key = profile["login_key"] + authorize_response = await self._client.post( + "https://api.nextory.com/user/v1/profile/authorize", + json = { + "login_key": login_key + } + ) + authorize_response = authorize_response.json() + rich.print(authorize_response) + profile_token = authorize_response["profile_token"] + self._client.headers.update({"X-Profile-Token": profile_token}) + self._client.headers.update({"X-Profile-Token": profile_token}) + + + @staticmethod + def _find_epub_id(product_data) -> str: + """Find id of book format of type epub for given book""" + for format in product_data["formats"]: + if format["type"] == "epub": + return format["identifier"] + raise InvalidUrl + + + @staticmethod + def _extract_id_from_url(url: str) -> str: + """ + Extract id of book from url. This id is not always the internal id for + the book. + + :param url: Url to book information page + :return: Id in url + """ + return url.split("-")[-1].replace("/", "") + + + async def download(self, url: str) -> Result: + url_id = self._extract_id_from_url(url) + if "serier" in url: + return await self._download_series(url_id) + else: + book_id = await self._get_book_id_from_url_id(url_id) + return await self._download_book(book_id) + + + async def download_book_from_id(self, book_id: str) -> Book: + return await self._download_book(book_id) + + + async def _download_series(self, series_id: str) -> Series: + """ + Download series from Nextory + + :param series_id: Id of series on Nextory + :returns: Series data + """ + response = await self._client.get( + f"https://api.nextory.com/discovery/v1/series/{series_id}/products", + params = { + "content_type": "book", + "page": 0, + "per": 100, + } + ) + series_data = response.json() + book_ids = [] + for book in series_data["products"]: + book_id = book["id"] + book_ids.append(book_id) + return Series( + title = series_data["products"][0]["series"]["name"], + book_ids = book_ids, + ) + + + @staticmethod + 
def _extract_series_name(product_info: dict) -> Optional[str]: + if not "series" in product_info: + return None + return product_info["series"]["name"] + + + async def _get_book_id_from_url_id(self, url_id: str) -> str: + """ + Download book id from url id + + :param url_id: Id of book from url + :return: Book id + """ + response = await self._client.get( + f"https://api.nextory.se/api/app/product/7.5/bookinfo", + params = { "id": url_id }, + ) + rich.print(response.url) + rich.print(response.content) + exit() + + + async def _download_book(self, book_id: str) -> Book: + product_data = await self._client.get( + f"https://api.nextory.com/library/v1/products/{book_id}" + ) + product_data = product_data.json() + epub_id = self._find_epub_id(product_data) + pages = await self._get_pages(epub_id) + return Book( + data = pages, + metadata = Metadata( + title = product_data["title"], + authors = [author["name"] for author in product_data["authors"]], + series = self._extract_series_name(product_data), + ) + ) + + + @staticmethod + def _fix_key(value: str) -> bytes: + """Remove unused data and decode key""" + return base64.b64decode(value[:-1]) + + + async def _get_pages(self, epub_id: str) -> BookData: + """ + Download page information for book + + :param epub_id: Id of epub file + :return: Page data + """ + # Nextory books are for some reason split up into multiple epub files - + # one for each chapter file. All of these files has to be decrypted and + # combined afterwards. Many of the provided epub files contain the same + # files and some of them contain the same file names but with variation + # in the content and comments that describe what should have been there + # if the book was whole from the start. 
+ response = await self._client.get( + f"https://api.nextory.com/reader/books/{epub_id}/packages/epub" + ) + epub_data = response.json() + encryption = AESEncryption( + key = self._fix_key(epub_data["crypt_key"]), + iv = self._fix_key(epub_data["crypt_iv"]) + ) + files = [] + for part in epub_data["spines"]: + files.append( + OnlineFile( + url = part["spine_url"], + extension = "epub", + encryption = encryption + ) + ) + files_in_toc = {} + for item in epub_data["toc"]["childrens"]: # Why is it "childrens"? + files_in_toc[item["src"]] = item["name"] + return EpubInParts( + files, + files_in_toc + )