Add basic support for Nextory

2025-12-16 04:09:10 +00:00 · 2023-06-12 22:36:54 +02:00 · 2023-06-12 22:36:54 +02:00 · 9e876a0cf6
commit 9e876a0cf6
parent f91a32b0ab
6 changed files with 308 additions and 5 deletions
--- a/README.md
+++ b/README.md
@@ -12,6 +12,7 @@ grawlix currently supports downloading from the following sources:
 - [Flipp](https://flipp.dk)
 - [Internet Archive](https://archive.org)
 - [Manga Plus](https://mangaplus.shueisha.co.jp)
+- [Nextory](https://nextory.com)
 - [Royal Road](https://www.royalroad.com)
 - [Saxo](https://saxo.com)
 - [Webtoons](https://webtoons.com)
--- a/grawlix/book.py
+++ b/grawlix/book.py
@@ -55,6 +55,16 @@ class ImageList:
    """
    images: list[OnlineFile]

+
+@dataclass(slots=True)
+class EpubInParts:
+    """
+    Epub split up into smaller epubs
+
+    Each entry in `files` is a separate epub archive holding a part of the
+    book. The epub output format downloads all of them and merges their
+    contents into one epub file.
+    """
+    # Partial epub archives, processed in list order when merging
+    files: list[OnlineFile]
+    # Table of contents: source path of an entry (anything after a "#" is
+    # ignored when matching against file names) mapped to the entry title
+    files_in_toc: dict[str, str]
+
+
 @dataclass(slots=True)
 class HtmlFile:
    title: str
@@ -67,6 +77,7 @@ class HtmlFiles:
    htmlfiles: list[HtmlFile]

 BookData = Union[
+    EpubInParts,
    SingleFile,
    ImageList,
    HtmlFiles
--- a/grawlix/output/__init__.py
+++ b/grawlix/output/__init__.py
@@ -1,4 +1,4 @@
-from grawlix.book import Book, BookData, SingleFile, ImageList, OnlineFile, HtmlFiles
+from grawlix.book import Book, BookData, SingleFile, ImageList, OnlineFile, HtmlFiles, EpubInParts
 from grawlix.exceptions import GrawlixError, UnsupportedOutputFormat
 from grawlix.logging import info

@@ -58,9 +58,9 @@ def get_default_format(book: Book) -> OutputFormat:
    bookdata = book.data
    if isinstance(bookdata, SingleFile):
        extension = bookdata.file.extension
-    if isinstance(bookdata, ImageList):
+    elif isinstance(bookdata, ImageList):
        extension = "cbz"
-    if isinstance(bookdata, HtmlFiles):
+    elif isinstance(bookdata, HtmlFiles) or isinstance(bookdata, EpubInParts):
        extension = "epub"
    output_format = find_output_format(book, extension)
    return output_format()
--- a/grawlix/output/epub.py
+++ b/grawlix/output/epub.py
@@ -1,4 +1,4 @@
-from grawlix.book import HtmlFiles, HtmlFile, OnlineFile, Book, SingleFile, Metadata
+from grawlix.book import HtmlFiles, HtmlFile, OnlineFile, Book, SingleFile, Metadata, EpubInParts
 from grawlix.exceptions import UnsupportedOutputFormat
 from .output_format import OutputFormat, Update

@@ -6,19 +6,25 @@ import asyncio
 from bs4 import BeautifulSoup
 import os
 from ebooklib import epub
+from zipfile import ZipFile
+import rich

 class Epub(OutputFormat):
    extension = "epub"
-    input_types = [SingleFile, HtmlFiles]
+    input_types = [SingleFile, HtmlFiles, EpubInParts]
+

    async def download(self, book: Book, location: str, update: Update) -> None:
+        # Dispatch on the type of the book data; each supported type has its
+        # own helper that writes the finished epub to `location`.
        if isinstance(book.data, SingleFile):
            await self._download_single_file(book, location, update)
        elif isinstance(book.data, HtmlFiles):
            await self._download_html_files(book.data, book.metadata, location, update)
+        elif isinstance(book.data, EpubInParts):
+            # Book is delivered as several partial epubs that have to be merged
+            await self._download_epub_in_parts(book.data, book.metadata, location, update)
        else:
+            # Any other kind of book data cannot be written as an epub
            raise UnsupportedOutputFormat

+
    async def _download_html_files(self, html: HtmlFiles, metadata: Metadata, location: str, update: Update) -> None:
        output = epub.EpubBook()
        output.set_title(metadata.title)
@@ -67,3 +73,59 @@ class Epub(OutputFormat):
        output.add_item(epub.EpubNcx())
        output.add_item(epub.EpubNav())
        epub.write_epub(location, output)
+
+
+    async def _download_epub_in_parts(self, data: EpubInParts, metadata: Metadata, location: str, update: Update) -> None:
+        """
+        Download every partial epub and merge them into one epub at `location`
+
+        The parts overlap, so a path that has already been copied from an
+        earlier part is skipped. The package (.opf) and ncx files of the parts
+        are dropped and the navigation is regenerated for the merged book.
+
+        :param data: Partial epubs and the table of contents entries
+        :param metadata: Book metadata; the title is written to the output
+        :param location: Path of the resulting epub file
+        :param update: Optional progress callback, called once per part
+        """
+        files = data.files
+        # Guard against an empty part list, which would divide by zero
+        progress = 1 / len(files) if files else 1
+        temporary_file_location = f"{location}.tmp"
+
+        # Title of each toc entry keyed by file name. The first entry wins if
+        # several entries point into the same file with different fragments.
+        toc_titles: dict[str, str] = {}
+        for toc_src, toc_title in data.files_in_toc.items():
+            toc_titles.setdefault(toc_src.split("#")[0], toc_title)
+
+        added_files: set[str] = set()
+        output = epub.EpubBook()
+        output.set_title(metadata.title)
+        try:
+            for file in files:
+                await self._download_and_write_file(file, temporary_file_location)
+                with ZipFile(temporary_file_location, "r") as zipfile:
+                    for filepath in zipfile.namelist():
+                        # Skip files copied from an earlier part and the
+                        # per-part package files
+                        if filepath in added_files or filepath.endswith((".opf", ".ncx")):
+                            continue
+                        content = zipfile.read(filepath)
+                        if filepath.endswith("html"):
+                            filename = os.path.basename(filepath)
+                            epub_file = epub.EpubHtml(
+                                title = toc_titles.get(filename),
+                                file_name = filepath,
+                                content = content
+                            )
+                            output.add_item(epub_file)
+                            output.spine.append(epub_file)
+                            if filename in toc_titles:
+                                output.toc.append(epub_file)
+                        else:
+                            epub_file = epub.EpubItem(
+                                file_name = filepath,
+                                content = content
+                            )
+                            output.add_item(epub_file)
+                        added_files.add(filepath)
+                if update:
+                    update(progress)
+        finally:
+            # Remove the temporary file even if a download or unzip failed;
+            # it does not exist if no part was written
+            if os.path.exists(temporary_file_location):
+                os.remove(temporary_file_location)
+
+        output.add_item(epub.EpubNcx())
+        output.add_item(epub.EpubNav())
+        epub.write_epub(location, output)
--- a/grawlix/sources/__init__.py
+++ b/grawlix/sources/__init__.py
@@ -5,6 +5,7 @@ from .ereolen import Ereolen
 from .flipp import Flipp
 from .internet_archive import InternetArchive
 from .mangaplus import MangaPlus
+from .nextory import Nextory
 from .royal_road import RoyalRoad
 from .saxo import Saxo
 from .webtoons import Webtoons
@@ -56,6 +57,7 @@ def get_source_classes() -> list[type[Source]]:
        Flipp,
        InternetArchive,
        MangaPlus,
+        Nextory,
        RoyalRoad,
        Saxo,
        Webtoons
--- a/grawlix/sources/nextory.py
+++ b/grawlix/sources/nextory.py
@@ -0,0 +1,227 @@
+from grawlix.book import Book, Metadata, OnlineFile, BookData, OnlineFile, SingleFile, EpubInParts, Result, Series
+from grawlix.encryption import AESEncryption
+from grawlix.exceptions import InvalidUrl
+from .source import Source
+
+from typing import Optional
+import uuid
+import rich
+import base64
+
+LOCALE = "en_GB"
+
+class Nextory(Source):
+    """Source for downloading books from Nextory"""
+    name: str = "Nextory"
+    match = [
+        # Dots are escaped so that they only match a literal "." in the host
+        r"https?://((www|catalog-\w\w)\.)?nextory\..+"
+    ]
+    _authentication_methods = [ "login" ]
+
+
+    @staticmethod
+    def _create_device_id() -> str:
+        """
+        Create the device id sent to Nextory
+
+        The id is a name based uuid, so the same value is produced on every
+        call.
+
+        :return: Device id as a string
+        """
+        device_uuid = uuid.uuid3(uuid.NAMESPACE_DNS, "audiobook-dl")
+        return str(device_uuid)
+
+
+    async def login(self, username: str, password: str, **kwargs) -> None:
+        """
+        Log in to Nextory and store the resulting tokens as client headers
+
+        Three requests are made: one creating a session for the account, one
+        listing the profiles of the account and one authorizing the first
+        profile in that list.
+
+        :param username: Identifier of the Nextory account
+        :param password: Password of the Nextory account
+        """
+        # Set permanent headers
+        device_id = self._create_device_id()
+        self._client.headers.update(
+            {
+                "X-Application-Id": "200",
+                "X-App-Version": "5.0.0",
+                "X-Locale": LOCALE,
+                "X-Model": "Personal Computer",
+                "X-Device-Id": device_id,
+                "locale": LOCALE,
+                "device": device_id,
+                "osinfo": "Android 13",
+                "model": "Personal Computer",
+                "version": "4.34.6",
+                "appid": "200",
+            }
+        )
+        # Login for account
+        # The responses below hold login tokens, so they are deliberately not
+        # printed or logged
+        session_response = await self._client.post(
+            "https://api.nextory.com/user/v1/sessions",
+            json = {
+                "identifier": username,
+                "password": password
+            },
+        )
+        session = session_response.json()
+        login_token = session["login_token"]
+        country = session["country"]
+        self._client.headers.update(
+            {
+                "token": login_token,
+                "X-Login-Token": login_token,
+                "X-Country-Code": country,
+            }
+        )
+        # Login for user
+        profiles_response = await self._client.get(
+            "https://api.nextory.com/user/v1/me/profiles",
+        )
+        # The first profile of the account is used
+        profile = profiles_response.json()["profiles"][0]
+        authorize_response = await self._client.post(
+            "https://api.nextory.com/user/v1/profile/authorize",
+            json = {
+                "login_key": profile["login_key"]
+            }
+        )
+        profile_token = authorize_response.json()["profile_token"]
+        self._client.headers.update({"X-Profile-Token": profile_token})
+
+
+    @staticmethod
+    def _find_epub_id(product_data) -> str:
+        """
+        Look up the identifier of the epub format of a book
+
+        :param product_data: Product information with a "formats" list
+        :return: Identifier of the first format of type "epub"
+        :raises InvalidUrl: If the book has no epub format
+        """
+        for book_format in product_data["formats"]:
+            if format_is_epub := book_format["type"] == "epub":
+                return book_format["identifier"]
+        raise InvalidUrl
+
+
+    @staticmethod
+    def _extract_id_from_url(url: str) -> str:
+        """
+        Take the id out of a url: everything after the last "-" with all
+        slashes removed. The result is not always the internal id of the book.
+
+        :param url: Url to book information page
+        :return: Id in url
+        """
+        # rpartition yields the whole url as tail when it contains no "-"
+        _, _, tail = url.rpartition("-")
+        return tail.replace("/", "")
+
+
+    async def download(self, url: str) -> Result:
+        """
+        Download the series or the book that a url points to
+
+        :param url: Url of a Nextory series or book
+        :return: Series if the url contains "serier", otherwise the book
+        """
+        url_id = self._extract_id_from_url(url)
+        if "serier" not in url:
+            # Book urls carry an id that first has to be resolved
+            book_id = await self._get_book_id_from_url_id(url_id)
+            return await self._download_book(book_id)
+        return await self._download_series(url_id)
+
+
+    async def download_book_from_id(self, book_id: str) -> Book:
+        """
+        Download a book from its Nextory book id
+
+        :param book_id: Id of book on Nextory
+        :return: Book data and metadata
+        """
+        book = await self._download_book(book_id)
+        return book
+
+
+    async def _download_series(self, series_id: str) -> Series:
+        """
+        Fetch the books of a series from Nextory
+
+        A single page of up to 100 products is requested. The series title is
+        read from the first product.
+
+        :param series_id: Id of series on Nextory
+        :returns: Series data
+        """
+        query = {
+            "content_type": "book",
+            "page": 0,
+            "per": 100,
+        }
+        response = await self._client.get(
+            f"https://api.nextory.com/discovery/v1/series/{series_id}/products",
+            params = query
+        )
+        products = response.json()["products"]
+        book_ids = [product["id"] for product in products]
+        title = products[0]["series"]["name"]
+        return Series(title = title, book_ids = book_ids)
+
+
+    @staticmethod
+    def _extract_series_name(product_info: dict) -> Optional[str]:
+        """
+        Read the name of the series a product belongs to
+
+        :param product_info: Product information
+        :return: Series name, or None if the product has no "series" key
+        """
+        if "series" in product_info:
+            return product_info["series"]["name"]
+        return None
+
+
+    async def _get_book_id_from_url_id(self, url_id: str) -> str:
+        """
+        Download book id from url id
+
+        The lookup request is made, but turning its response into a book id
+        is not implemented yet. An exception is raised instead of terminating
+        the whole process, so callers can handle or report it.
+
+        :param url_id: Id of book from url
+        :return: Book id
+        :raises NotImplementedError: Always, until the response is parsed
+        """
+        response = await self._client.get(
+            "https://api.nextory.se/api/app/product/7.5/bookinfo",
+            params = { "id": url_id },
+        )
+        # TODO: extract the book id from the response once its layout is known
+        raise NotImplementedError(
+            f"Resolving Nextory url id {url_id!r} to a book id is not "
+            f"implemented yet (requested {response.url})"
+        )
+
+
+    async def _download_book(self, book_id: str) -> Book:
+        """
+        Fetch product information and page data for a book
+
+        :param book_id: Id of book on Nextory
+        :return: Book with its epub parts and metadata
+        """
+        response = await self._client.get(
+            f"https://api.nextory.com/library/v1/products/{book_id}"
+        )
+        product = response.json()
+        pages = await self._get_pages(self._find_epub_id(product))
+        metadata = Metadata(
+            title = product["title"],
+            authors = [author["name"] for author in product["authors"]],
+            series = self._extract_series_name(product),
+        )
+        return Book(data = pages, metadata = metadata)
+
+
+    @staticmethod
+    def _fix_key(value: str) -> bytes:
+        """
+        Drop the last character of the value and base64 decode the rest
+
+        :param value: Encoded key as delivered by Nextory
+        :return: Decoded key
+        """
+        without_last_character = value[:-1]
+        return base64.b64decode(without_last_character)
+
+
+    async def _get_pages(self, epub_id: str) -> BookData:
+        """
+        Fetch the list of epub parts and the table of contents of a book
+
+        :param epub_id: Id of epub file
+        :return: Page data
+        """
+        # Nextory delivers a book as several epub files - one per chapter
+        # file - that all have to be decrypted and combined afterwards. The
+        # parts overlap: many contain the same files, and some contain files
+        # with the same name but different content, including comments that
+        # describe what would have been there in a whole book.
+        response = await self._client.get(
+            f"https://api.nextory.com/reader/books/{epub_id}/packages/epub"
+        )
+        package = response.json()
+        # One key and iv are shared by all parts of the book
+        encryption = AESEncryption(
+            key = self._fix_key(package["crypt_key"]),
+            iv = self._fix_key(package["crypt_iv"])
+        )
+        files = [
+            OnlineFile(
+                url = spine["spine_url"],
+                extension = "epub",
+                encryption = encryption
+            )
+            for spine in package["spines"]
+        ]
+        # "childrens" is the key used by the API
+        files_in_toc = {
+            entry["src"]: entry["name"]
+            for entry in package["toc"]["childrens"]
+        }
+        return EpubInParts(files, files_in_toc)