Merge branch 'archiveorg'

This commit is contained in:
Joakim Holm 2023-05-04 21:30:58 +02:00
commit 73692bfe0f
9 changed files with 170 additions and 6 deletions

View File

@ -9,6 +9,7 @@ CLI ebook downloader
grawlix currently supports downloading from the following sources: grawlix currently supports downloading from the following sources:
- [eReolen](https://ereolen.dk) - [eReolen](https://ereolen.dk)
- [Flipp](https://flipp.dk) - [Flipp](https://flipp.dk)
- [Internet Archive](https://archive.org)
- [Manga Plus](https://mangaplus.shueisha.co.jp) - [Manga Plus](https://mangaplus.shueisha.co.jp)
- [Royal Road](https://www.royalroad.com) - [Royal Road](https://www.royalroad.com)
- [Saxo](https://saxo.com) - [Saxo](https://saxo.com)

View File

@ -20,7 +20,7 @@ def get_login(source: Source, config: Config, options) -> Tuple[str, str, Option
:param options: Command line options :param options: Command line options
:returns: Login credentials :returns: Login credentials
""" """
source_name = source.name.lower() source_name = source.name.lower().replace(" ", "")
if source_name in config.sources: if source_name in config.sources:
username = config.sources[source_name].username or options.username username = config.sources[source_name].username or options.username
password = config.sources[source_name].password or options.password password = config.sources[source_name].password or options.password

View File

@ -31,11 +31,20 @@ class OnlineFile:
encryption: Optional[Encryption] = None encryption: Optional[Encryption] = None
headers: Optional[dict[str, str]] = None headers: Optional[dict[str, str]] = None
@dataclass(slots=True)
class OfflineFile:
"""Stores content of a file"""
content: bytes
extension: str
encryption: Optional[Encryption] = None
File = Union[OnlineFile, OfflineFile]
@dataclass(slots=True) @dataclass(slots=True)
class SingleFile: class SingleFile:
"""Bookdata in the form of a single file""" """Bookdata in the form of a single file"""
file: OnlineFile file: File
@dataclass(slots=True) @dataclass(slots=True)

View File

@ -3,6 +3,7 @@ from grawlix.exceptions import GrawlixError
from grawlix.logging import info from grawlix.logging import info
from .output_format import OutputFormat from .output_format import OutputFormat
from .acsm import Acsm
from .cbz import Cbz from .cbz import Cbz
from .epub import Epub from .epub import Epub
@ -85,6 +86,7 @@ def get_output_formats() -> list[type[OutputFormat]]:
:returns: List of available output format classes :returns: List of available output format classes
""" """
return [ return [
Acsm,
Cbz, Cbz,
Epub, Epub,
] ]

22
grawlix/output/acsm.py Normal file
View File

@ -0,0 +1,22 @@
from grawlix.book import Book
from .output_format import OutputFormat, Update
import shutil
import subprocess
class Acsm(OutputFormat):
    """Output format for Adobe Content Server Message (.acsm) license files"""

    extension = "acsm"

    async def dl_single_file(self, book: Book, location: str, update_func: Update) -> None:
        """
        Download an acsm file and, when possible, decrypt it in place.

        :param book: Book to download
        :param location: Path where the file is written
        :param update_func: Optional progress callback forwarded to the base class
        """
        # Download and write acsm file to disk
        await super().dl_single_file(book, location, update_func)
        # TODO: Implement more general solution
        # Decrypt if knock is available
        # https://web.archive.org/web/20221016154220/https://github.com/BentonEdmondson/knock
        if shutil.which("knock") is not None:
            result = subprocess.run(
                ["knock", location],
                capture_output = True
            )
            # Don't swallow a failed decryption silently; the acsm file is
            # still on disk, so warn and continue rather than raise.
            if result.returncode != 0:
                print(f"Warning: knock failed to decrypt {location}")
        else:
            print("Warning: knock not found in PATH; acsm file was not decrypted")

View File

@ -1,4 +1,4 @@
from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book, OfflineFile
from grawlix.exceptions import UnsupportedOutputFormat from grawlix.exceptions import UnsupportedOutputFormat
from grawlix.encryption import decrypt from grawlix.encryption import decrypt
@ -9,7 +9,7 @@ Update = Optional[Callable[[float], None]]
class OutputFormat: class OutputFormat:
# Extension for output files # Extension for output files
extension: str = "" extension: str
def __init__(self) -> None: def __init__(self) -> None:
self._client = httpx.AsyncClient() self._client = httpx.AsyncClient()
@ -32,7 +32,10 @@ class OutputFormat:
raise UnsupportedOutputFormat raise UnsupportedOutputFormat
if not book.data.file.extension == self.extension: if not book.data.file.extension == self.extension:
raise UnsupportedOutputFormat raise UnsupportedOutputFormat
await self._download_and_write_file(book.data.file, location, update_func) if isinstance(book.data.file, OnlineFile):
await self._download_and_write_file(book.data.file, location, update_func)
elif isinstance(book.data.file, OfflineFile):
self._write_offline_file(book.data.file, location)
async def dl_image_list(self, book: Book, location: str, update_func: Update) -> None: async def dl_image_list(self, book: Book, location: str, update_func: Update) -> None:
@ -88,3 +91,17 @@ class OutputFormat:
content = await self._download_file(file, update) content = await self._download_file(file, update)
with open(location, "wb") as f: with open(location, "wb") as f:
f.write(content) f.write(content)
def _write_offline_file(self, file: OfflineFile, location: str) -> None:
"""
Write the content of an `OfflineFile` to disk
:param file: File to write to disk
:param location: Path to where the file is written
"""
with open(location, "wb") as f:
content = file.content
if file.encryption:
content = decrypt(content, file.encryption)
f.write(content)

View File

@ -3,6 +3,7 @@ from grawlix.exceptions import NoSourceFound
from .source import Source from .source import Source
from .ereolen import Ereolen from .ereolen import Ereolen
from .flipp import Flipp from .flipp import Flipp
from .internet_archive import InternetArchive
from .mangaplus import MangaPlus from .mangaplus import MangaPlus
from .royal_road import RoyalRoad from .royal_road import RoyalRoad
from .saxo import Saxo from .saxo import Saxo
@ -53,6 +54,7 @@ def get_source_classes() -> list[type[Source]]:
return [ return [
Ereolen, Ereolen,
Flipp, Flipp,
InternetArchive,
MangaPlus, MangaPlus,
RoyalRoad, RoyalRoad,
Saxo, Saxo,

View File

@ -2,8 +2,8 @@ from grawlix.book import Result, Book, SingleFile, Metadata, OnlineFile
from grawlix.encryption import AESCTREncryption from grawlix.encryption import AESCTREncryption
from grawlix.exceptions import InvalidUrl, DataNotFound from grawlix.exceptions import InvalidUrl, DataNotFound
from grawlix.utils import nearest_string from grawlix.utils import nearest_string
from .source import Source from .source import Source
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import json import json
import re import re

View File

@ -0,0 +1,111 @@
from grawlix.book import Book, SingleFile, Metadata, OfflineFile
from .source import Source
import random
import string
from bs4 import BeautifulSoup
import asyncio
class InternetArchive(Source):
    """Source for downloading loanable ebooks from archive.org"""

    name: str = "Internet Archive"
    match: list[str] = [
        r"https://archive.org/details/.+"
    ]
    _authentication_methods = [ "login", "cookies" ]
    _login_credentials = [ "username", "password" ]

    @staticmethod
    def _format_data(content_type: str, fields) -> str:
        """
        Encode form fields as a multipart/form-data request body.

        :param content_type: Boundary string separating the parts
        :param fields: Mapping of form field names to values
        :returns: Encoded request body
        """
        data = ""
        for name, value in fields.items():
            data += f"--{content_type}\x0d\x0aContent-Disposition: form-data; name=\"{name}\"\x0d\x0a\x0d\x0a{value}\x0d\x0a"
        # Fix: the closing delimiter must also carry the leading "--"
        # (RFC 2046: "--boundary--"), matching the per-part delimiters above.
        data += f"--{content_type}--"
        return data

    async def login(self, username: str, password: str, **kwargs) -> None:
        """
        Log in to archive.org with username and password.

        Exits the process if the login is rejected.

        :param username: archive.org account name
        :param password: archive.org account password
        """
        # Initial GET establishes the session cookies the login endpoint expects
        await self._client.get("https://archive.org/account/login")
        content_type = "----WebKitFormBoundary"+"".join(random.sample(string.ascii_letters + string.digits, 16))
        headers = {'Content-Type': 'multipart/form-data; boundary='+content_type}
        data = self._format_data(content_type, {"username":username, "password":password, "submit_by_js":"true"})
        response = await self._client.post(
            "https://archive.org/account/login",
            data=data,
            headers=headers
        )
        if "Successful login" not in response.text:
            print("Failed login")
            exit(1)

    async def _download_acsm(self, book_id: str) -> bytes:
        """
        Loan book on archive.org and download acsm file

        :param book_id: Id of book
        :returns: Raw content of the acsm file
        """
        await self._client.post(
            "https://archive.org/services/loans/loan/searchInside.php",
            data = {
                "action": "grant_access",
                "identifier": book_id
            }
        )
        await self._client.post(
            "https://archive.org/services/loans/loan/",
            data = {
                "action": "browse_book",
                "identifier": book_id
            }
        )
        # TODO: Error handling
        await self._client.post(
            "https://archive.org/services/loans/loan/",
            data = {
                "action": "create_token",
                "identifier": book_id
            }
        )
        acsm_response = await self._client.get(
            f"https://archive.org/services/loans/loan/?action=media_url&identifier={book_id}&format=pdf&redirect=1",
            follow_redirects = True
        )
        return acsm_response.content

    async def download(self, url: str) -> Book:
        """
        Download a book from an archive.org details url.

        :param url: Url on the form https://archive.org/details/<id>
        :returns: Downloaded book with metadata and acsm file
        """
        # Url matches r"https://archive.org/details/.+", so index 4 is the id
        book_id = url.split("/")[4]
        # Metadata and acsm file are independent requests; fetch concurrently
        metadata, acsm_file = await asyncio.gather(
            self._download_metadata(book_id),
            self._download_acsm(book_id)
        )
        return Book(
            data = SingleFile(
                OfflineFile(
                    content = acsm_file,
                    extension = "acsm",
                )
            ),
            metadata = Metadata(
                title = metadata["title"],
                # NOTE(review): "creator" is assumed to be a single string here;
                # verify it is never a list in archive.org metadata
                authors = [ metadata.get("creator") ] if "creator" in metadata else []
            )
        )

    async def _download_metadata(self, book_id: str) -> dict:
        """
        Download metadata for book

        :param book_id: Id of book
        :returns: Dictionary with metadata
        """
        page_response = await self._client.get(
            f"https://archive.org/details/{book_id}"
        )
        soup = BeautifulSoup(page_response.text, "lxml")
        # NOTE(review): assumes the details page always contains an
        # <ia-book-theater> element; find() returning None would raise here
        metadata_url = soup.find("ia-book-theater").get("bookmanifesturl")
        metadata_response = await self._client.get(
            f"https:{metadata_url}"
        )
        return metadata_response.json()["data"]["metadata"]