Merge branch 'archiveorg'

This commit is contained in:
Joakim Holm 2023-05-04 21:30:58 +02:00
commit 73692bfe0f
9 changed files with 170 additions and 6 deletions

View File

@ -9,6 +9,7 @@ CLI ebook downloader
grawlix currently supports downloading from the following sources:
- [eReolen](https://ereolen.dk)
- [Flipp](https://flipp.dk)
- [Internet Archive](https://archive.org)
- [Manga Plus](https://mangaplus.shueisha.co.jp)
- [Royal Road](https://www.royalroad.com)
- [Saxo](https://saxo.com)

View File

@ -20,7 +20,7 @@ def get_login(source: Source, config: Config, options) -> Tuple[str, str, Option
:param options: Command line options
:returns: Login credentials
"""
source_name = source.name.lower()
source_name = source.name.lower().replace(" ", "")
if source_name in config.sources:
username = config.sources[source_name].username or options.username
password = config.sources[source_name].password or options.password

View File

@ -31,11 +31,20 @@ class OnlineFile:
encryption: Optional[Encryption] = None
headers: Optional[dict[str, str]] = None
@dataclass(slots=True)
class OfflineFile:
    """Stores the content of a file that is already fully downloaded in memory"""
    # Raw (possibly encrypted) file content
    content: bytes
    # File extension without a leading dot, e.g. "acsm"
    extension: str
    # Encryption applied to `content`; removed before the file is written to disk
    encryption: Optional[Encryption] = None
File = Union[OnlineFile, OfflineFile]
@dataclass(slots=True)
class SingleFile:
"""Bookdata in the form of a single file"""
file: OnlineFile
file: File
@dataclass(slots=True)

View File

@ -3,6 +3,7 @@ from grawlix.exceptions import GrawlixError
from grawlix.logging import info
from .output_format import OutputFormat
from .acsm import Acsm
from .cbz import Cbz
from .epub import Epub
@ -85,6 +86,7 @@ def get_output_formats() -> list[type[OutputFormat]]:
:returns: List of available output format classes
"""
return [
Acsm,
Cbz,
Epub,
]

22
grawlix/output/acsm.py Normal file
View File

@ -0,0 +1,22 @@
from grawlix.book import Book
from .output_format import OutputFormat, Update
import shutil
import subprocess
class Acsm(OutputFormat):
    """Output format for Adobe Content Server Message (`.acsm`) files"""

    extension = "acsm"

    async def dl_single_file(self, book: Book, location: str, update_func: Update) -> None:
        """
        Download an acsm file to disk and, when possible, decrypt it in place.

        :param book: Book to download
        :param location: Path the file is written to
        :param update_func: Progress callback
        """
        # Fetch and write the raw acsm file to disk
        await super().dl_single_file(book, location, update_func)
        # TODO: Implement more general solution
        # Decrypt in place when the external `knock` tool is installed
        # https://web.archive.org/web/20221016154220/https://github.com/BentonEdmondson/knock
        if shutil.which("knock") is None:
            # TODO: Print warning
            return
        subprocess.run(
            ["knock", location],
            capture_output = True
        )

View File

@ -1,4 +1,4 @@
from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book
from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book, OfflineFile
from grawlix.exceptions import UnsupportedOutputFormat
from grawlix.encryption import decrypt
@ -9,7 +9,7 @@ Update = Optional[Callable[[float], None]]
class OutputFormat:
# Extension for output files
extension: str = ""
extension: str
def __init__(self) -> None:
self._client = httpx.AsyncClient()
@ -32,7 +32,10 @@ class OutputFormat:
raise UnsupportedOutputFormat
if not book.data.file.extension == self.extension:
raise UnsupportedOutputFormat
await self._download_and_write_file(book.data.file, location, update_func)
if isinstance(book.data.file, OnlineFile):
await self._download_and_write_file(book.data.file, location, update_func)
elif isinstance(book.data.file, OfflineFile):
self._write_offline_file(book.data.file, location)
async def dl_image_list(self, book: Book, location: str, update_func: Update) -> None:
@ -88,3 +91,17 @@ class OutputFormat:
content = await self._download_file(file, update)
with open(location, "wb") as f:
f.write(content)
def _write_offline_file(self, file: OfflineFile, location: str) -> None:
    """
    Write the content of an `OfflineFile` to disk

    :param file: File to write to disk
    :param location: Path to where the file is written
    """
    with open(location, "wb") as output:
        if file.encryption:
            # Strip the encryption before the bytes hit the disk
            output.write(decrypt(file.content, file.encryption))
        else:
            output.write(file.content)

View File

@ -3,6 +3,7 @@ from grawlix.exceptions import NoSourceFound
from .source import Source
from .ereolen import Ereolen
from .flipp import Flipp
from .internet_archive import InternetArchive
from .mangaplus import MangaPlus
from .royal_road import RoyalRoad
from .saxo import Saxo
@ -53,6 +54,7 @@ def get_source_classes() -> list[type[Source]]:
return [
Ereolen,
Flipp,
InternetArchive,
MangaPlus,
RoyalRoad,
Saxo,

View File

@ -2,8 +2,8 @@ from grawlix.book import Result, Book, SingleFile, Metadata, OnlineFile
from grawlix.encryption import AESCTREncryption
from grawlix.exceptions import InvalidUrl, DataNotFound
from grawlix.utils import nearest_string
from .source import Source
from bs4 import BeautifulSoup
import json
import re

View File

@ -0,0 +1,111 @@
from grawlix.book import Book, SingleFile, Metadata, OfflineFile
from .source import Source
import random
import string
from bs4 import BeautifulSoup
import asyncio
class InternetArchive(Source):
    """Download source for borrowed ebooks from https://archive.org"""

    name: str = "Internet Archive"
    match: list[str] = [
        r"https://archive.org/details/.+"
    ]
    _authentication_methods = [ "login", "cookies" ]
    _login_credentials = [ "username", "password" ]

    @staticmethod
    def _format_data(boundary: str, fields) -> str:
        """
        Manually encode form fields as a multipart/form-data request body.

        :param boundary: Multipart boundary string (without the leading "--")
        :param fields: Mapping of form field names to values
        :returns: Encoded request body
        """
        data = ""
        for name, value in fields.items():
            data += f"--{boundary}\x0d\x0aContent-Disposition: form-data; name=\"{name}\"\x0d\x0a\x0d\x0a{value}\x0d\x0a"
        # The closing delimiter also requires the leading "--" (RFC 2046 §5.1.1);
        # it was previously emitted as `boundary + "--"` without them.
        data += f"--{boundary}--"
        return data

    async def login(self, username: str, password: str, **kwargs) -> None:
        """
        Login to archive.org with username and password.

        Exits the process when the login is rejected.

        :param username: Account username
        :param password: Account password
        """
        # Initial GET picks up the session cookies the login endpoint expects
        await self._client.get("https://archive.org/account/login")
        boundary = "----WebKitFormBoundary" + "".join(
            random.choices(string.ascii_letters + string.digits, k=16)
        )
        headers = {"Content-Type": f"multipart/form-data; boundary={boundary}"}
        data = self._format_data(
            boundary,
            {"username": username, "password": password, "submit_by_js": "true"}
        )
        response = await self._client.post(
            "https://archive.org/account/login",
            data = data,
            headers = headers
        )
        if "Successful login" not in response.text:
            # TODO: raise a grawlix exception instead of exiting directly
            print("Failed login")
            exit(1)

    async def _download_acsm(self, book_id: str) -> bytes:
        """
        Loan book on archive.org and download its acsm file

        :param book_id: Id of book
        :returns: Raw content of the downloaded acsm file
        """
        # Request access to the book before creating a loan
        await self._client.post(
            "https://archive.org/services/loans/loan/searchInside.php",
            data = {
                "action": "grant_access",
                "identifier": book_id
            }
        )
        await self._client.post(
            "https://archive.org/services/loans/loan/",
            data = {
                "action": "browse_book",
                "identifier": book_id
            }
        )
        # TODO: Error handling
        await self._client.post(
            "https://archive.org/services/loans/loan/",
            data = {
                "action": "create_token",
                "identifier": book_id
            }
        )
        acsm_response = await self._client.get(
            f"https://archive.org/services/loans/loan/?action=media_url&identifier={book_id}&format=pdf&redirect=1",
            follow_redirects = True
        )
        return acsm_response.content

    async def download(self, url: str) -> Book:
        """
        Download a book from its archive.org details page url

        :param url: Url on the form https://archive.org/details/<book_id>
        :returns: Downloaded book as an acsm `OfflineFile`
        """
        # Url matches r"https://archive.org/details/.+", so index 4 is the book id
        book_id = url.split("/")[4]
        # Metadata and acsm file are independent requests; fetch them concurrently
        metadata, acsm_file = await asyncio.gather(
            self._download_metadata(book_id),
            self._download_acsm(book_id)
        )
        return Book(
            data = SingleFile(
                OfflineFile(
                    content = acsm_file,
                    extension = "acsm",
                )
            ),
            metadata = Metadata(
                title = metadata["title"],
                authors = [ metadata["creator"] ] if "creator" in metadata else []
            )
        )

    async def _download_metadata(self, book_id: str) -> dict:
        """
        Download metadata for book

        :param book_id: Id of book
        :returns: Dictionary with metadata
        """
        page_response = await self._client.get(
            f"https://archive.org/details/{book_id}"
        )
        soup = BeautifulSoup(page_response.text, "lxml")
        # NOTE(review): crashes with AttributeError if the page has no
        # <ia-book-theater> element — TODO handle missing element explicitly
        metadata_url = soup.find("ia-book-theater").get("bookmanifesturl")
        metadata_response = await self._client.get(
            f"https:{metadata_url}"
        )
        return metadata_response.json()["data"]["metadata"]