Merge branch 'archiveorg'

This commit is contained in:
Joakim Holm 2023-05-04 21:30:58 +02:00
commit 73692bfe0f
9 changed files with 170 additions and 6 deletions

View File

@ -9,6 +9,7 @@ CLI ebook downloader
grawlix currently supports downloading from the following sources: grawlix currently supports downloading from the following sources:
- [eReolen](https://ereolen.dk) - [eReolen](https://ereolen.dk)
- [Flipp](https://flipp.dk) - [Flipp](https://flipp.dk)
- [Internet Archive](https://archive.org)
- [Manga Plus](https://mangaplus.shueisha.co.jp) - [Manga Plus](https://mangaplus.shueisha.co.jp)
- [Royal Road](https://www.royalroad.com) - [Royal Road](https://www.royalroad.com)
- [Saxo](https://saxo.com) - [Saxo](https://saxo.com)

View File

@ -20,7 +20,7 @@ def get_login(source: Source, config: Config, options) -> Tuple[str, str, Option
:param options: Command line options :param options: Command line options
:returns: Login credentials :returns: Login credentials
""" """
source_name = source.name.lower() source_name = source.name.lower().replace(" ", "")
if source_name in config.sources: if source_name in config.sources:
username = config.sources[source_name].username or options.username username = config.sources[source_name].username or options.username
password = config.sources[source_name].password or options.password password = config.sources[source_name].password or options.password

View File

@ -31,11 +31,20 @@ class OnlineFile:
encryption: Optional[Encryption] = None encryption: Optional[Encryption] = None
headers: Optional[dict[str, str]] = None headers: Optional[dict[str, str]] = None
@dataclass(slots=True)
class OfflineFile:
"""Stores content of a file"""
content: bytes
extension: str
encryption: Optional[Encryption] = None
File = Union[OnlineFile, OfflineFile]
@dataclass(slots=True) @dataclass(slots=True)
class SingleFile: class SingleFile:
"""Bookdata in the form of a single file""" """Bookdata in the form of a single file"""
file: OnlineFile file: File
@dataclass(slots=True) @dataclass(slots=True)

View File

@ -3,6 +3,7 @@ from grawlix.exceptions import GrawlixError
from grawlix.logging import info from grawlix.logging import info
from .output_format import OutputFormat from .output_format import OutputFormat
from .acsm import Acsm
from .cbz import Cbz from .cbz import Cbz
from .epub import Epub from .epub import Epub
@ -85,6 +86,7 @@ def get_output_formats() -> list[type[OutputFormat]]:
:returns: List of available output format classes :returns: List of available output format classes
""" """
return [ return [
Acsm,
Cbz, Cbz,
Epub, Epub,
] ]

22
grawlix/output/acsm.py Normal file
View File

@ -0,0 +1,22 @@
from grawlix.book import Book
from .output_format import OutputFormat, Update
import shutil
import subprocess
class Acsm(OutputFormat):
    """Output format for Adobe Content Server Message (.acsm) license files"""

    extension = "acsm"

    async def dl_single_file(self, book: Book, location: str, update_func: Update) -> None:
        """
        Download an acsm file and, when possible, decrypt it in place.

        :param book: Book to download
        :param location: Path where the file is written
        :param update_func: Optional progress callback forwarded to the base class
        """
        # Download and write acsm file to disk
        await super().dl_single_file(book, location, update_func)
        # TODO: Implement more general solution
        # Decrypt if knock is available
        # https://web.archive.org/web/20221016154220/https://github.com/BentonEdmondson/knock
        if shutil.which("knock") is not None:
            result = subprocess.run(
                ["knock", location],
                capture_output = True
            )
            # Don't swallow a failed decryption silently; the acsm file is
            # still on disk, so warn and continue rather than raise.
            if result.returncode != 0:
                print(f"Warning: knock failed to decrypt {location}")
        else:
            print("Warning: knock not found in PATH; acsm file was not decrypted")

View File

@ -1,4 +1,4 @@
from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book, OfflineFile
from grawlix.exceptions import UnsupportedOutputFormat from grawlix.exceptions import UnsupportedOutputFormat
from grawlix.encryption import decrypt from grawlix.encryption import decrypt
@ -9,7 +9,7 @@ Update = Optional[Callable[[float], None]]
class OutputFormat: class OutputFormat:
# Extension for output files # Extension for output files
extension: str = "" extension: str
def __init__(self) -> None: def __init__(self) -> None:
self._client = httpx.AsyncClient() self._client = httpx.AsyncClient()
@ -32,7 +32,10 @@ class OutputFormat:
raise UnsupportedOutputFormat raise UnsupportedOutputFormat
if not book.data.file.extension == self.extension: if not book.data.file.extension == self.extension:
raise UnsupportedOutputFormat raise UnsupportedOutputFormat
await self._download_and_write_file(book.data.file, location, update_func) if isinstance(book.data.file, OnlineFile):
await self._download_and_write_file(book.data.file, location, update_func)
elif isinstance(book.data.file, OfflineFile):
self._write_offline_file(book.data.file, location)
async def dl_image_list(self, book: Book, location: str, update_func: Update) -> None: async def dl_image_list(self, book: Book, location: str, update_func: Update) -> None:
@ -88,3 +91,17 @@ class OutputFormat:
content = await self._download_file(file, update) content = await self._download_file(file, update)
with open(location, "wb") as f: with open(location, "wb") as f:
f.write(content) f.write(content)
def _write_offline_file(self, file: OfflineFile, location: str) -> None:
"""
Write the content of an `OfflineFile` to disk
:param file: File to write to disk
:param location: Path to where the file is written
"""
with open(location, "wb") as f:
content = file.content
if file.encryption:
content = decrypt(content, file.encryption)
f.write(content)

View File

@ -3,6 +3,7 @@ from grawlix.exceptions import NoSourceFound
from .source import Source from .source import Source
from .ereolen import Ereolen from .ereolen import Ereolen
from .flipp import Flipp from .flipp import Flipp
from .internet_archive import InternetArchive
from .mangaplus import MangaPlus from .mangaplus import MangaPlus
from .royal_road import RoyalRoad from .royal_road import RoyalRoad
from .saxo import Saxo from .saxo import Saxo
@ -53,6 +54,7 @@ def get_source_classes() -> list[type[Source]]:
return [ return [
Ereolen, Ereolen,
Flipp, Flipp,
InternetArchive,
MangaPlus, MangaPlus,
RoyalRoad, RoyalRoad,
Saxo, Saxo,

View File

@ -2,8 +2,8 @@ from grawlix.book import Result, Book, SingleFile, Metadata, OnlineFile
from grawlix.encryption import AESCTREncryption from grawlix.encryption import AESCTREncryption
from grawlix.exceptions import InvalidUrl, DataNotFound from grawlix.exceptions import InvalidUrl, DataNotFound
from grawlix.utils import nearest_string from grawlix.utils import nearest_string
from .source import Source from .source import Source
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import json import json
import re import re

View File

@ -0,0 +1,111 @@
from grawlix.book import Book, SingleFile, Metadata, OfflineFile
from .source import Source
import random
import string
from bs4 import BeautifulSoup
import asyncio
class InternetArchive(Source):
    """Source for downloading loanable ebooks from archive.org"""

    name: str = "Internet Archive"
    match: list[str] = [
        r"https://archive.org/details/.+"
    ]
    _authentication_methods = [ "login", "cookies" ]
    _login_credentials = [ "username", "password" ]

    @staticmethod
    def _format_data(content_type: str, fields) -> str:
        """
        Encode form fields as a multipart/form-data request body.

        :param content_type: Boundary string separating the parts
        :param fields: Mapping of form field names to values
        :returns: Encoded request body
        """
        data = ""
        for name, value in fields.items():
            data += f"--{content_type}\x0d\x0aContent-Disposition: form-data; name=\"{name}\"\x0d\x0a\x0d\x0a{value}\x0d\x0a"
        # Fix: the closing delimiter must also carry the leading "--"
        # (RFC 2046: "--boundary--"), matching the per-part delimiters above.
        data += f"--{content_type}--"
        return data

    async def login(self, username: str, password: str, **kwargs) -> None:
        """
        Log in to archive.org with username and password.

        Exits the process if the login is rejected.

        :param username: archive.org account name
        :param password: archive.org account password
        """
        # Initial GET establishes the session cookies the login endpoint expects
        await self._client.get("https://archive.org/account/login")
        content_type = "----WebKitFormBoundary"+"".join(random.sample(string.ascii_letters + string.digits, 16))
        headers = {'Content-Type': 'multipart/form-data; boundary='+content_type}
        data = self._format_data(content_type, {"username":username, "password":password, "submit_by_js":"true"})
        response = await self._client.post(
            "https://archive.org/account/login",
            data=data,
            headers=headers
        )
        if "Successful login" not in response.text:
            print("Failed login")
            exit(1)

    async def _download_acsm(self, book_id: str) -> bytes:
        """
        Loan book on archive.org and download acsm file

        :param book_id: Id of book
        :returns: Raw content of the acsm file
        """
        await self._client.post(
            "https://archive.org/services/loans/loan/searchInside.php",
            data = {
                "action": "grant_access",
                "identifier": book_id
            }
        )
        await self._client.post(
            "https://archive.org/services/loans/loan/",
            data = {
                "action": "browse_book",
                "identifier": book_id
            }
        )
        # TODO: Error handling
        await self._client.post(
            "https://archive.org/services/loans/loan/",
            data = {
                "action": "create_token",
                "identifier": book_id
            }
        )
        acsm_response = await self._client.get(
            f"https://archive.org/services/loans/loan/?action=media_url&identifier={book_id}&format=pdf&redirect=1",
            follow_redirects = True
        )
        return acsm_response.content

    async def download(self, url: str) -> Book:
        """
        Download a book from an archive.org details url.

        :param url: Url on the form https://archive.org/details/<id>
        :returns: Downloaded book with metadata and acsm file
        """
        # Url matches r"https://archive.org/details/.+", so index 4 is the id
        book_id = url.split("/")[4]
        # Metadata and acsm file are independent requests; fetch concurrently
        metadata, acsm_file = await asyncio.gather(
            self._download_metadata(book_id),
            self._download_acsm(book_id)
        )
        return Book(
            data = SingleFile(
                OfflineFile(
                    content = acsm_file,
                    extension = "acsm",
                )
            ),
            metadata = Metadata(
                title = metadata["title"],
                # NOTE(review): "creator" is assumed to be a single string here;
                # verify it is never a list in archive.org metadata
                authors = [ metadata.get("creator") ] if "creator" in metadata else []
            )
        )

    async def _download_metadata(self, book_id: str) -> dict:
        """
        Download metadata for book

        :param book_id: Id of book
        :returns: Dictionary with metadata
        """
        page_response = await self._client.get(
            f"https://archive.org/details/{book_id}"
        )
        soup = BeautifulSoup(page_response.text, "lxml")
        # NOTE(review): assumes the details page always contains an
        # <ia-book-theater> element; find() returning None would raise here
        metadata_url = soup.find("ia-book-theater").get("bookmanifesturl")
        metadata_response = await self._client.get(
            f"https:{metadata_url}"
        )
        return metadata_response.json()["data"]["metadata"]