Merge branch 'master' of github.com:jo1gi/grawlix

2025-12-16 04:09:10 +00:00 · 2023-05-08 12:46:31 +02:00 · 2023-05-08 12:46:31 +02:00 · 4c32ac847f
commit 4c32ac847f
parent f9919fc48b f91a32b0ab
12 changed files with 236 additions and 64 deletions
--- a/README.md
+++ b/README.md
@ -3,12 +3,14 @@
 ![GitHub top language](https://img.shields.io/github/languages/top/jo1gi/grawlix)
 ![License](https://img.shields.io/github/license/jo1gi/grawlix)
 [![Donate using Ko-Fi](https://img.shields.io/badge/donate-kofi-00b9fe?logo=ko-fi&logoColor=00b9fe)](https://ko-fi.com/jo1gi)
+
 CLI ebook downloader

 ## Supported services
 grawlix currently supports downloading from the following sources:
 - [eReolen](https://ereolen.dk)
 - [Flipp](https://flipp.dk)
+- [Internet Archive](https://archive.org)
 - [Manga Plus](https://mangaplus.shueisha.co.jp)
 - [Royal Road](https://www.royalroad.com)
 - [Saxo](https://saxo.com)
--- a/grawlix/main.py
+++ b/grawlix/main.py
@ -20,7 +20,7 @@ def get_login(source: Source, config: Config, options) -> Tuple[str, str, Option
    :param options: Command line options
    :returns: Login credentials
    """
-    source_name = source.name.lower()
+    source_name = source.name.lower().replace(" ", "")
    if source_name in config.sources:
        username = config.sources[source_name].username or options.username
        password = config.sources[source_name].password or options.password
--- a/grawlix/book.py
+++ b/grawlix/book.py
@ -31,11 +31,20 @@ class OnlineFile:
    encryption: Optional[Encryption] = None
    headers: Optional[dict[str, str]] = None

+@dataclass(slots=True)
+class OfflineFile:
+    """Stores content of a file"""
+    # Raw bytes of the file
+    content: bytes
+    # File extension without a leading dot (e.g. "acsm")
+    extension: str
+    # Encryption of `content`, if any; when set, the content is decrypted
+    # before it is written to disk
+    encryption: Optional[Encryption] = None
+
+File = Union[OnlineFile, OfflineFile]
+

@dataclass(slots=True)
 class SingleFile:
    """Bookdata in the form of a single file"""
-    file: OnlineFile
+    file: File


@dataclass(slots=True)
--- a/grawlix/config.py
+++ b/grawlix/config.py
@ -33,7 +33,7 @@ def load_config() -> Config:
    else:
        config_dict = {}
    sources = {}
-    if "source" in config_dict:
+    if "sources" in config_dict:
        for key, values in config_dict["sources"].items():
            sources[key] = SourceConfig (
                username = values.get("username"),
--- a/grawlix/output/init.py
+++ b/grawlix/output/init.py
@ -1,8 +1,9 @@
 from grawlix.book import Book, BookData, SingleFile, ImageList, OnlineFile, HtmlFiles
-from grawlix.exceptions import GrawlixError
+from grawlix.exceptions import GrawlixError, UnsupportedOutputFormat
 from grawlix.logging import info

 from .output_format import OutputFormat
+from .acsm import Acsm
 from .cbz import Cbz
 from .epub import Epub

@ -16,7 +17,12 @@ async def download_book(book: Book, update_func: Callable, template: str) -> Non

    :param book: Book to download
    """
-    output_format = get_default_format(book.data)
+    _, ext = os.path.splitext(template)
+    ext = ext[1:]
+    if ext in get_valid_extensions():
+        output_format = find_output_format(book, ext)()
+    else:
+        output_format = get_default_format(book)
    location = format_output_location(book, output_format, template)
    if not book.overwrite and os.path.exists(location):
        info("Skipping - File already exists")
@ -24,14 +30,7 @@ async def download_book(book: Book, update_func: Callable, template: str) -> Non
    parent = Path(location).parent
    if not parent.exists():
        os.makedirs(parent)
-    if isinstance(book.data, SingleFile):
-        await output_format.dl_single_file(book, location, update_func)
-    elif isinstance(book.data, ImageList):
-        await output_format.dl_image_list(book, location, update_func)
-    elif isinstance(book.data, HtmlFiles):
-        await output_format.dl_html_files(book, location, update_func)
-    else:
-        raise NotImplementedError
+    await output_format.download(book, location, update_func)
    await output_format.close()


@ -48,34 +47,43 @@ def format_output_location(book: Book, output_format: OutputFormat, template: st
    return template.format(**values, ext = output_format.extension)


-def get_default_format(bookdata: BookData) -> OutputFormat:
+def get_default_format(book: Book) -> OutputFormat:
    """
    Get default output format for bookdata.
    Should only be used if no format was specified by the user

-    :param bookdata: Content of book
+    :param book: Book whose data determines the default format
    :returns: OutputFormat object matching the default
+    :raises UnsupportedOutputFormat: If the book data has no default format
    """
+    bookdata = book.data
    if isinstance(bookdata, SingleFile):
-        return output_format_from_str(bookdata.file.extension)
-    if isinstance(bookdata, ImageList):
-        return Cbz()
-    if isinstance(bookdata, HtmlFiles):
-        return Epub()
-    raise GrawlixError
+        # A single file keeps its own extension
+        extension = bookdata.file.extension
+    elif isinstance(bookdata, ImageList):
+        extension = "cbz"
+    elif isinstance(bookdata, HtmlFiles):
+        extension = "epub"
+    else:
+        # Unknown book data: fail explicitly instead of reaching the lookup
+        # below with `extension` unbound
+        raise UnsupportedOutputFormat
+    output_format = find_output_format(book, extension)
+    return output_format()


-def output_format_from_str(name: str) -> OutputFormat:
+def find_output_format(book: Book, extension: str) -> type[OutputFormat]:
    """
-    Convert string to outputformat object
+    Find a compatible output format

-    :param name: Name of output format
-    :returns: OutputFormat object
+    :param book: Book to download
+    :param extension: Extension of output file
+    :returns: Compatible OutputFormat type
+    :raises UnsupportedOutputFormat: If no format has the extension and supports the book data
    """
    for output_format in get_output_formats():
-        if output_format.extension == name:
-            return output_format()
-    raise GrawlixError
+        matches_extension = output_format.extension == extension
+        # Exact type lookup: the class of `book.data` itself must be listed in `input_types`
+        supports_bookdata = type(book.data) in output_format.input_types
+        if matches_extension and supports_bookdata:
+            return output_format
+    raise UnsupportedOutputFormat
+
+def get_valid_extensions() -> list[str]:
+    """
+    List the file extensions of all available output formats
+
+    :returns: Extension of each class returned by `get_output_formats`, in the same order
+    """
+    extensions = []
+    for format_class in get_output_formats():
+        extensions.append(format_class.extension)
+    return extensions


 def get_output_formats() -> list[type[OutputFormat]]:
@ -85,6 +93,7 @@ def get_output_formats() -> list[type[OutputFormat]]:
    :returns: List of available output format classes
    """
    return [
+        Acsm,
        Cbz,
        Epub,
    ]
--- a/grawlix/output/acsm.py
+++ b/grawlix/output/acsm.py
@ -0,0 +1,23 @@
+from grawlix.book import Book, SingleFile
+from .output_format import OutputFormat, Update
+import shutil
+import subprocess
+
+class Acsm(OutputFormat):
+    """Output format for `acsm` files"""
+
+    extension = "acsm"
+    input_types = [SingleFile]
+
+    async def download(self, book: Book, location: str, update_func: Update) -> None:
+        """
+        Write the acsm file to disk and run `knock` on it if it is installed
+
+        :param book: Book to download
+        :param location: Path to where the file is written
+        :param update_func: Function to update progress bar
+        """
+        # Download and write acsm file to disk
+        await self._download_single_file(book, location, update_func)
+        # TODO: Implement more general solution
+        # https://web.archive.org/web/20221016154220/https://github.com/BentonEdmondson/knock
+        knock_missing = shutil.which("knock") is None
+        if knock_missing:
+            # TODO: Print warning
+            return
+        # Decrypt with knock; its output is captured and its exit status is
+        # not checked
+        subprocess.run(["knock", location], capture_output = True)
--- a/grawlix/output/cbz.py
+++ b/grawlix/output/cbz.py
@ -9,18 +9,21 @@ class Cbz(OutputFormat):
    """Comic book zip file"""

    extension: str = "cbz"
+    input_types = [ImageList]

-    async def dl_image_list(self, book: Book, location: str, update: Update) -> None:
+    async def download(self, book: Book, location: str, update: Update) -> None:
        if not isinstance(book.data, ImageList):
            raise UnsupportedOutputFormat
+        semaphore = asyncio.Semaphore(10)
        images = book.data.images
        image_count = len(images)
        with ZipFile(location, mode="w") as zip:
            async def download_page(index: int, file: OnlineFile):
-                content = await self._download_file(file)
-                zip.writestr(f"Image {index}.{file.extension}", content)
-                if update:
-                    update(1/image_count)
+                async with semaphore:
+                    content = await self._download_file(file)
+                    zip.writestr(f"Image {index}.{file.extension}", content)
+                    if update:
+                        update(1/image_count)
            tasks = [
                asyncio.create_task(download_page(index, file))
                for index, file in enumerate(images)
--- a/grawlix/output/epub.py
+++ b/grawlix/output/epub.py
@ -1,4 +1,4 @@
-from grawlix.book import HtmlFiles, HtmlFile, OnlineFile, Book
+from grawlix.book import HtmlFiles, HtmlFile, OnlineFile, Book, SingleFile, Metadata
 from grawlix.exceptions import UnsupportedOutputFormat
 from .output_format import OutputFormat, Update

@ -9,14 +9,20 @@ from ebooklib import epub

 class Epub(OutputFormat):
    extension = "epub"
+    input_types = [SingleFile, HtmlFiles]

-    async def dl_html_files(self, book: Book, location: str, update: Update) -> None:
-        if not isinstance(book.data, HtmlFiles):
+    async def download(self, book: Book, location: str, update: Update) -> None:
+        if isinstance(book.data, SingleFile):
+            await self._download_single_file(book, location, update)
+        elif isinstance(book.data, HtmlFiles):
+            await self._download_html_files(book.data, book.metadata, location, update)
+        else:
            raise UnsupportedOutputFormat
-        html = book.data
+
+    async def _download_html_files(self, html: HtmlFiles, metadata: Metadata, location: str, update: Update) -> None:
        output = epub.EpubBook()
-        output.set_title(book.metadata.title)
-        for author in book.metadata.authors:
+        output.set_title(metadata.title)
+        for author in metadata.authors:
            output.add_author(author)
        file_count = len(html.htmlfiles) + 1 # Html files + cover

--- a/grawlix/output/output_format.py
+++ b/grawlix/output/output_format.py
@ -1,4 +1,4 @@
-from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book
+from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book, OfflineFile, BookData
 from grawlix.exceptions import UnsupportedOutputFormat
 from grawlix.encryption import decrypt

@ -9,7 +9,8 @@ Update = Optional[Callable[[float], None]]

 class OutputFormat:
    # Extension for output files
-    extension: str = ""
+    extension: str
+    input_types: list[type[BookData]]

    def __init__(self) -> None:
        self._client = httpx.AsyncClient()
@ -20,7 +21,18 @@ class OutputFormat:
        await self._client.aclose()


-    async def dl_single_file(self, book: Book, location: str, update_func: Update) -> None:
+    async def download(self, book: Book, location: str, update_func: Update) -> None:
+        """
+        Download book
+
+        This base implementation only raises; the concrete output formats
+        (`Acsm`, `Cbz`, `Epub`) define their own `download`.
+
+        :param book: Book to download
+        :param location: Path to where the file is written
+        :param update_func: Function to update progress bar
+        :raises UnsupportedOutputFormat: Always, in this base implementation
+        """
+        raise UnsupportedOutputFormat
+
+
+    async def _download_single_file(self, book: Book, location: str, update_func: Update) -> None:
        """
        Download and write an `grawlix.SingleFile` to disk

@ -32,29 +44,10 @@ class OutputFormat:
            raise UnsupportedOutputFormat
        if not book.data.file.extension == self.extension:
            raise UnsupportedOutputFormat
-        await self._download_and_write_file(book.data.file, location, update_func)
-
-
-    async def dl_image_list(self, book: Book, location: str, update_func: Update) -> None:
-        """
-        Download and write an `grawlix.ImageList` to disk
-
-        :param book: Book to download
-        :param location: Path to where the file is written
-        :raises UnsupportedOutputFormat: If datatype is not supported by format
-        """
-        raise UnsupportedOutputFormat
-
-
-    async def dl_html_files(self, book: Book, location: str, update_func: Update) -> None:
-        """
-        Download and write a `grawlix.HtmlFiles` to disk
-
-        :param book: Book to download
-        :param location: Path to where the file is written
-        :raises UnsupportedOutputFormat: If datatype is not supported by format
-        """
-        raise UnsupportedOutputFormat
+        if isinstance(book.data.file, OnlineFile):
+            await self._download_and_write_file(book.data.file, location, update_func)
+        elif isinstance(book.data.file, OfflineFile):
+            self._write_offline_file(book.data.file, location)


    async def _download_file(self, file: OnlineFile, update: Update = None) -> bytes:
@ -88,3 +81,17 @@ class OutputFormat:
        content = await self._download_file(file, update)
        with open(location, "wb") as f:
            f.write(content)
+
+
+    def _write_offline_file(self, file: OfflineFile, location: str) -> None:
+        """
+        Write the content of an `OfflineFile` to disk
+
+        The content is decrypted first if the file has encryption set.
+
+        :param file: File to write to disk
+        :param location: Path to where the file is written
+        """
+        content = file.content
+        if file.encryption:
+            content = decrypt(content, file.encryption)
+        # Open the output only after decryption has succeeded, so a failure
+        # does not leave an empty file behind at `location`
+        with open(location, "wb") as f:
+            f.write(content)
--- a/grawlix/sources/init.py
+++ b/grawlix/sources/init.py
@ -3,6 +3,7 @@ from grawlix.exceptions import NoSourceFound
 from .source import Source
 from .ereolen import Ereolen
 from .flipp import Flipp
+from .internet_archive import InternetArchive
 from .mangaplus import MangaPlus
 from .royal_road import RoyalRoad
 from .saxo import Saxo
@ -53,6 +54,7 @@ def get_source_classes() -> list[type[Source]]:
    return [
        Ereolen,
        Flipp,
+        InternetArchive,
        MangaPlus,
        RoyalRoad,
        Saxo,
--- a/grawlix/sources/ereolen.py
+++ b/grawlix/sources/ereolen.py
@ -2,8 +2,8 @@ from grawlix.book import Result, Book, SingleFile, Metadata, OnlineFile
 from grawlix.encryption import AESCTREncryption
 from grawlix.exceptions import InvalidUrl, DataNotFound
 from grawlix.utils import nearest_string
-
 from .source import Source
+
 from bs4 import BeautifulSoup
 import json
 import re
--- a/grawlix/sources/internet_archive.py
+++ b/grawlix/sources/internet_archive.py
@ -0,0 +1,111 @@
+from grawlix.book import Book, SingleFile, Metadata, OfflineFile
+from .source import Source
+
+import random
+import string
+from bs4 import BeautifulSoup
+import asyncio
+
+class InternetArchive(Source):
+    """
+    Source for books on the Internet Archive (archive.org)
+
+    A book is loaned through the archive.org loan service and returned as an
+    `acsm` file wrapped in an `OfflineFile`.
+    """
+    name: str = "Internet Archive"
+    match: list[str] = [
+        r"https://archive.org/details/.+"
+    ]
+    _authentication_methods = [ "login", "cookies" ]
+    _login_credentials = [ "username", "password" ]
+
+    @staticmethod
+    def _format_data(content_type: str, fields):
+        """
+        Encode form fields as a `multipart/form-data` request body
+
+        :param content_type: Multipart boundary, as announced in the
+            `Content-Type` header (without the `--` delimiter prefix)
+        :param fields: Mapping from form field name to value
+        :returns: Encoded request body
+        """
+        parts = [
+            f"--{content_type}\r\n"
+            f"Content-Disposition: form-data; name=\"{name}\"\r\n"
+            f"\r\n"
+            f"{value}\r\n"
+            for name, value in fields.items()
+        ]
+        # The closing delimiter is "--" + boundary + "--"; without the leading
+        # dashes the body is never terminated correctly
+        parts.append(f"--{content_type}--\r\n")
+        return "".join(parts)
+
+
+    async def login(self, username: str, password: str, **kwargs) -> None:
+        """
+        Log in to archive.org with username and password
+
+        Prints a message and exits the program if the login fails.
+
+        :param username: Username of the account
+        :param password: Password of the account
+        """
+        login_url = "https://archive.org/account/login"
+        # Load the login page before posting the credentials
+        # NOTE(review): presumably needed to receive session cookies - confirm
+        await self._client.get(login_url)
+        content_type = "----WebKitFormBoundary"+"".join(random.sample(string.ascii_letters + string.digits, 16))
+        headers = {'Content-Type': 'multipart/form-data; boundary='+content_type}
+        data = self._format_data(content_type, {"username":username, "password":password, "submit_by_js":"true"})
+        response = await self._client.post(
+            login_url,
+            data=data,
+            headers=headers
+        )
+        if "Successful login" not in response.text:
+            print("Failed login")
+            exit(1)
+
+
+    async def _download_acsm(self, book_id: str) -> bytes:
+        """
+        Loan book on archive.org and download acsm file
+
+        :param book_id: Id of book
+        :returns: Content of the response to the media url request
+        """
+        # The responses of the three loan requests are not inspected
+        await self._client.post(
+            "https://archive.org/services/loans/loan/searchInside.php",
+            data = {
+                "action": "grant_access",
+                "identifier": book_id
+            }
+        )
+        await self._client.post(
+            "https://archive.org/services/loans/loan/",
+            data = {
+                "action": "browse_book",
+                "identifier": book_id
+            }
+        )
+        # TODO: Error handling
+        await self._client.post(
+            "https://archive.org/services/loans/loan/",
+            data = {
+                "action": "create_token",
+                "identifier": book_id
+            }
+        )
+        acsm_response = await self._client.get(
+            f"https://archive.org/services/loans/loan/?action=media_url&identifier={book_id}&format=pdf&redirect=1",
+            follow_redirects = True
+        )
+        return acsm_response.content
+
+
+    async def download(self, url: str) -> Book:
+        """
+        Download book from archive.org
+
+        :param url: Url of the form `https://archive.org/details/<book id>`
+        :returns: Book with the acsm file as a single offline file
+        """
+        # The id is the path segment after "details"; drop any query string
+        # or fragment that follows it
+        book_id = url.split("/")[4].split("?")[0].split("#")[0]
+        # Metadata and acsm file are fetched concurrently
+        metadata, acsm_file = await asyncio.gather(
+            self._download_metadata(book_id),
+            self._download_acsm(book_id)
+        )
+        return Book(
+            data = SingleFile(
+                OfflineFile(
+                    content = acsm_file,
+                    extension = "acsm",
+                )
+            ),
+            metadata = Metadata(
+                title = metadata["title"],
+                # NOTE(review): assumes "creator" is a single name - confirm
+                authors = [ metadata["creator"] ] if "creator" in metadata else []
+            )
+        )
+
+
+    async def _download_metadata(self, book_id: str) -> dict:
+        """
+        Download metadata for book
+
+        :param book_id: Id of book
+        :returns: Dictionary with metadata
+        """
+        page_response = await self._client.get(
+            f"https://archive.org/details/{book_id}"
+        )
+        soup = BeautifulSoup(page_response.text, "lxml")
+        # The manifest url is read from the `bookmanifesturl` attribute of the
+        # `ia-book-theater` element and is prefixed with "https:" below
+        # NOTE(review): no handling for a page without that element
+        metadata_url = soup.find("ia-book-theater").get("bookmanifesturl")
+        metadata_response = await self._client.get(
+            f"https:{metadata_url}"
+        )
+        return metadata_response.json()["data"]["metadata"]