diff --git a/README.md b/README.md index 36bc222..dbbd7b8 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ CLI ebook downloader grawlix currently supports downloading from the following sources: - [eReolen](https://ereolen.dk) - [Flipp](https://flipp.dk) +- [Internet Archive](https://archive.org) - [Manga Plus](https://mangaplus.shueisha.co.jp) - [Royal Road](https://www.royalroad.com) - [Saxo](https://saxo.com) diff --git a/grawlix/__main__.py b/grawlix/__main__.py index 62590c9..6439a5d 100644 --- a/grawlix/__main__.py +++ b/grawlix/__main__.py @@ -20,7 +20,7 @@ def get_login(source: Source, config: Config, options) -> Tuple[str, str, Option :param options: Command line options :returns: Login credentials """ - source_name = source.name.lower() + source_name = source.name.lower().replace(" ", "") if source_name in config.sources: username = config.sources[source_name].username or options.username password = config.sources[source_name].password or options.password diff --git a/grawlix/book.py b/grawlix/book.py index c1ec9d8..ae49564 100644 --- a/grawlix/book.py +++ b/grawlix/book.py @@ -31,11 +31,20 @@ class OnlineFile: encryption: Optional[Encryption] = None headers: Optional[dict[str, str]] = None +@dataclass(slots=True) +class OfflineFile: + """Stores content of a file""" + content: bytes + extension: str + encryption: Optional[Encryption] = None + +File = Union[OnlineFile, OfflineFile] + @dataclass(slots=True) class SingleFile: """Bookdata in the form of a single file""" - file: OnlineFile + file: File @dataclass(slots=True) diff --git a/grawlix/output/__init__.py b/grawlix/output/__init__.py index 33f5c2c..fcca4b9 100644 --- a/grawlix/output/__init__.py +++ b/grawlix/output/__init__.py @@ -3,6 +3,7 @@ from grawlix.exceptions import GrawlixError from grawlix.logging import info from .output_format import OutputFormat +from .acsm import Acsm from .cbz import Cbz from .epub import Epub @@ -85,6 +86,7 @@ def get_output_formats() -> list[type[OutputFormat]]: 
:returns: List of available output format classes """ return [ + Acsm, Cbz, Epub, ] diff --git a/grawlix/output/acsm.py b/grawlix/output/acsm.py new file mode 100644 index 0000000..ca5230f --- /dev/null +++ b/grawlix/output/acsm.py @@ -0,0 +1,22 @@ +from grawlix.book import Book +from .output_format import OutputFormat, Update +import shutil +import subprocess + +class Acsm(OutputFormat): + extension = "acsm" + + async def dl_single_file(self, book: Book, location: str, update_func: Update) -> None: + # Download and write acsm file to disk + await super().dl_single_file(book, location, update_func) + # TODO: Implement more general solution + # Decrypt if knock is available + # https://web.archive.org/web/20221016154220/https://github.com/BentonEdmondson/knock + if shutil.which("knock") is not None: + subprocess.run( + ["knock", location], + capture_output = True + ) + else: + # TODO: Print warning + pass diff --git a/grawlix/output/output_format.py b/grawlix/output/output_format.py index 83406e6..f9a60ea 100644 --- a/grawlix/output/output_format.py +++ b/grawlix/output/output_format.py @@ -1,4 +1,4 @@ -from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book +from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book, OfflineFile from grawlix.exceptions import UnsupportedOutputFormat from grawlix.encryption import decrypt @@ -9,7 +9,7 @@ Update = Optional[Callable[[float], None]] class OutputFormat: # Extension for output files - extension: str = "" + extension: str def __init__(self) -> None: self._client = httpx.AsyncClient() @@ -32,7 +32,10 @@ class OutputFormat: raise UnsupportedOutputFormat if not book.data.file.extension == self.extension: raise UnsupportedOutputFormat - await self._download_and_write_file(book.data.file, location, update_func) + if isinstance(book.data.file, OnlineFile): + await self._download_and_write_file(book.data.file, location, update_func) + elif isinstance(book.data.file, OfflineFile): 
+ self._write_offline_file(book.data.file, location) async def dl_image_list(self, book: Book, location: str, update_func: Update) -> None: @@ -88,3 +91,17 @@ class OutputFormat: content = await self._download_file(file, update) with open(location, "wb") as f: f.write(content) + + + def _write_offline_file(self, file: OfflineFile, location: str) -> None: + """ + Write the content of an `OfflineFile` to disk + + :param file: File to write to disk + :param location: Path to where the file is written + """ + with open(location, "wb") as f: + content = file.content + if file.encryption: + content = decrypt(content, file.encryption) + f.write(content) diff --git a/grawlix/sources/__init__.py b/grawlix/sources/__init__.py index cd87be7..7e1fbf4 100644 --- a/grawlix/sources/__init__.py +++ b/grawlix/sources/__init__.py @@ -3,6 +3,7 @@ from grawlix.exceptions import NoSourceFound from .source import Source from .ereolen import Ereolen from .flipp import Flipp +from .internet_archive import InternetArchive from .mangaplus import MangaPlus from .royal_road import RoyalRoad from .saxo import Saxo @@ -53,6 +54,7 @@ def get_source_classes() -> list[type[Source]]: return [ Ereolen, Flipp, + InternetArchive, MangaPlus, RoyalRoad, Saxo, diff --git a/grawlix/sources/ereolen.py b/grawlix/sources/ereolen.py index 0494b90..1e16866 100644 --- a/grawlix/sources/ereolen.py +++ b/grawlix/sources/ereolen.py @@ -2,8 +2,8 @@ from grawlix.book import Result, Book, SingleFile, Metadata, OnlineFile from grawlix.encryption import AESCTREncryption from grawlix.exceptions import InvalidUrl, DataNotFound from grawlix.utils import nearest_string - from .source import Source + from bs4 import BeautifulSoup import json import re diff --git a/grawlix/sources/internet_archive.py b/grawlix/sources/internet_archive.py new file mode 100644 index 0000000..d158723 --- /dev/null +++ b/grawlix/sources/internet_archive.py @@ -0,0 +1,111 @@ +from grawlix.book import Book, SingleFile, Metadata, OfflineFile +from 
from .source import Source
from grawlix.exceptions import DataNotFound, InvalidUrl

import asyncio
import random
import string
import sys
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup


class InternetArchive(Source):
    """Source that borrows a book on archive.org and returns its acsm loan file"""

    name: str = "Internet Archive"
    match: list[str] = [
        r"https://archive\.org/details/.+"
    ]
    _authentication_methods = [ "login", "cookies" ]
    _login_credentials = [ "username", "password" ]

    @staticmethod
    def _format_data(content_type: str, fields) -> str:
        """
        Encode form fields as a multipart/form-data request body

        :param content_type: Multipart boundary, without the leading "--"
        :param fields: Mapping of form field names to their values
        :returns: Request body using CRLF line endings
        """
        parts = [
            f"--{content_type}\r\n"
            f'Content-Disposition: form-data; name="{name}"\r\n'
            f"\r\n"
            f"{value}\r\n"
            for name, value in fields.items()
        ]
        # The closing delimiter is "--" + boundary + "--"; without the
        # leading dashes the body is not terminated correctly.
        parts.append(f"--{content_type}--\r\n")
        return "".join(parts)


    @staticmethod
    def _extract_book_id(url: str) -> str:
        """
        Extract the book identifier from an archive.org details url

        :param url: Url of the form https://archive.org/details/<id>[/...]
        :returns: Identifier of the book, without query string or fragment
        :raises InvalidUrl: If the url path does not contain an identifier
        """
        segments = [segment for segment in urlparse(url).path.split("/") if segment]
        if len(segments) < 2 or segments[0] != "details":
            raise InvalidUrl
        return segments[1]


    @staticmethod
    def _as_author_list(creator) -> list[str]:
        """
        Normalize the "creator" metadata field to a list of names

        :param creator: Missing value, a single name or a sequence of names
        :returns: List of author names
        """
        if creator is None:
            return []
        if isinstance(creator, str):
            return [creator]
        return list(creator)


    async def login(self, username: str, password: str, **kwargs) -> None:
        """
        Log in to archive.org

        Prints "Failed login" and exits the process with status 1 if the
        response does not report a successful login.

        :param username: Username of the archive.org account
        :param password: Password of the archive.org account
        """
        login_url = "https://archive.org/account/login"
        # The login page is requested before posting the form, presumably to
        # pick up the cookies the site sets there.
        await self._client.get(login_url)
        boundary = "----WebKitFormBoundary" + "".join(
            random.sample(string.ascii_letters + string.digits, 16)
        )
        body = self._format_data(
            boundary,
            {"username": username, "password": password, "submit_by_js": "true"}
        )
        response = await self._client.post(
            login_url,
            # The body is already encoded, so it is sent as raw content
            content = body,
            headers = {"Content-Type": f"multipart/form-data; boundary={boundary}"}
        )
        if "Successful login" not in response.text:
            print("Failed login")
            sys.exit(1)


    async def _download_acsm(self, book_id: str) -> bytes:
        """
        Loan book on archive.org and download acsm file

        :param book_id: Id of book
        :returns: Content of the acsm file
        """
        loan_url = "https://archive.org/services/loans/loan/"
        # The requests are issued one after another in this fixed order
        loan_steps = [
            (loan_url + "searchInside.php", "grant_access"),
            (loan_url, "browse_book"),
            (loan_url, "create_token"),
        ]
        # TODO: Error handling
        for step_url, action in loan_steps:
            await self._client.post(
                step_url,
                data = {
                    "action": action,
                    "identifier": book_id
                }
            )
        acsm_response = await self._client.get(
            loan_url,
            # Passed as params so the client escapes the identifier
            params = {
                "action": "media_url",
                "identifier": book_id,
                "format": "pdf",
                "redirect": "1"
            },
            follow_redirects = True
        )
        return acsm_response.content


    async def download(self, url: str) -> Book:
        """
        Borrow the book behind `url` and return it with its metadata

        :param url: Url of the book's details page on archive.org
        :returns: Book containing the acsm file and the book's metadata
        :raises InvalidUrl: If no book id can be extracted from the url
        """
        book_id = self._extract_book_id(url)
        metadata, acsm_file = await asyncio.gather(
            self._download_metadata(book_id),
            self._download_acsm(book_id)
        )
        return Book(
            data = SingleFile(
                OfflineFile(
                    content = acsm_file,
                    extension = "acsm",
                )
            ),
            metadata = Metadata(
                title = metadata["title"],
                authors = self._as_author_list(metadata.get("creator"))
            )
        )


    async def _download_metadata(self, book_id: str) -> dict:
        """
        Download metadata for book

        :param book_id: Id of book
        :returns: Dictionary with metadata
        :raises DataNotFound: If the details page does not link to a book manifest
        """
        page_response = await self._client.get(
            f"https://archive.org/details/{book_id}"
        )
        soup = BeautifulSoup(page_response.text, "lxml")
        theater = soup.find("ia-book-theater")
        manifest_url = theater.get("bookmanifesturl") if theater is not None else None
        if not manifest_url:
            raise DataNotFound
        # urljoin resolves protocol-relative ("//host/path"), absolute and
        # site-relative manifest urls alike
        metadata_response = await self._client.get(
            urljoin("https://archive.org", manifest_url)
        )
        return metadata_response.json()["data"]["metadata"]