Merge branch 'archiveorg'

This commit is contained in:
Joakim Holm 2023-05-04 21:30:58 +02:00
commit 73692bfe0f
9 changed files with 170 additions and 6 deletions

View File

@ -9,6 +9,7 @@ CLI ebook downloader
grawlix currently supports downloading from the following sources:
- [eReolen](https://ereolen.dk)
- [Flipp](https://flipp.dk)
- [Internet Archive](https://archive.org)
- [Manga Plus](https://mangaplus.shueisha.co.jp)
- [Royal Road](https://www.royalroad.com)
- [Saxo](https://saxo.com)

View File

@ -20,7 +20,7 @@ def get_login(source: Source, config: Config, options) -> Tuple[str, str, Option
:param options: Command line options
:returns: Login credentials
"""
source_name = source.name.lower()
source_name = source.name.lower().replace(" ", "")
if source_name in config.sources:
username = config.sources[source_name].username or options.username
password = config.sources[source_name].password or options.password

View File

@ -31,11 +31,20 @@ class OnlineFile:
encryption: Optional[Encryption] = None
headers: Optional[dict[str, str]] = None
@dataclass(slots=True)
class OfflineFile:
    """Stores the content of a file that is already fully downloaded in memory"""
    # Raw (possibly encrypted) file content
    content: bytes
    # File extension without a leading dot, e.g. "acsm"
    extension: str
    # Encryption applied to `content`; removed before the file is written to disk
    encryption: Optional[Encryption] = None
File = Union[OnlineFile, OfflineFile]
@dataclass(slots=True)
class SingleFile:
"""Bookdata in the form of a single file"""
file: OnlineFile
file: File
@dataclass(slots=True)

View File

@ -3,6 +3,7 @@ from grawlix.exceptions import GrawlixError
from grawlix.logging import info
from .output_format import OutputFormat
from .acsm import Acsm
from .cbz import Cbz
from .epub import Epub
@ -85,6 +86,7 @@ def get_output_formats() -> list[type[OutputFormat]]:
:returns: List of available output format classes
"""
return [
Acsm,
Cbz,
Epub,
]

22
grawlix/output/acsm.py Normal file
View File

@ -0,0 +1,22 @@
from grawlix.book import Book
from .output_format import OutputFormat, Update
import shutil
import subprocess
class Acsm(OutputFormat):
    """Output format for Adobe Content Server Message (`.acsm`) files"""

    extension = "acsm"

    async def dl_single_file(self, book: Book, location: str, update_func: Update) -> None:
        """
        Download an acsm file to disk and, when possible, decrypt it in place.

        :param book: Book to download
        :param location: Path the file is written to
        :param update_func: Progress callback
        """
        # Fetch and write the raw acsm file to disk
        await super().dl_single_file(book, location, update_func)
        # TODO: Implement more general solution
        # Decrypt in place when the external `knock` tool is installed
        # https://web.archive.org/web/20221016154220/https://github.com/BentonEdmondson/knock
        if shutil.which("knock") is None:
            # TODO: Print warning
            return
        subprocess.run(
            ["knock", location],
            capture_output = True
        )

View File

@ -1,4 +1,4 @@
from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book
from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book, OfflineFile
from grawlix.exceptions import UnsupportedOutputFormat
from grawlix.encryption import decrypt
@ -9,7 +9,7 @@ Update = Optional[Callable[[float], None]]
class OutputFormat:
# Extension for output files
extension: str = ""
extension: str
def __init__(self) -> None:
self._client = httpx.AsyncClient()
@ -32,7 +32,10 @@ class OutputFormat:
raise UnsupportedOutputFormat
if not book.data.file.extension == self.extension:
raise UnsupportedOutputFormat
await self._download_and_write_file(book.data.file, location, update_func)
if isinstance(book.data.file, OnlineFile):
await self._download_and_write_file(book.data.file, location, update_func)
elif isinstance(book.data.file, OfflineFile):
self._write_offline_file(book.data.file, location)
async def dl_image_list(self, book: Book, location: str, update_func: Update) -> None:
@ -88,3 +91,17 @@ class OutputFormat:
content = await self._download_file(file, update)
with open(location, "wb") as f:
f.write(content)
def _write_offline_file(self, file: OfflineFile, location: str) -> None:
    """
    Write the content of an `OfflineFile` to disk

    :param file: File to write to disk
    :param location: Path to where the file is written
    """
    with open(location, "wb") as output:
        if file.encryption:
            # Strip the encryption before the bytes hit the disk
            output.write(decrypt(file.content, file.encryption))
        else:
            output.write(file.content)

View File

@ -3,6 +3,7 @@ from grawlix.exceptions import NoSourceFound
from .source import Source
from .ereolen import Ereolen
from .flipp import Flipp
from .internet_archive import InternetArchive
from .mangaplus import MangaPlus
from .royal_road import RoyalRoad
from .saxo import Saxo
@ -53,6 +54,7 @@ def get_source_classes() -> list[type[Source]]:
return [
Ereolen,
Flipp,
InternetArchive,
MangaPlus,
RoyalRoad,
Saxo,

View File

@ -2,8 +2,8 @@ from grawlix.book import Result, Book, SingleFile, Metadata, OnlineFile
from grawlix.encryption import AESCTREncryption
from grawlix.exceptions import InvalidUrl, DataNotFound
from grawlix.utils import nearest_string
from .source import Source
from bs4 import BeautifulSoup
import json
import re

View File

@ -0,0 +1,111 @@
from grawlix.book import Book, SingleFile, Metadata, OfflineFile
from .source import Source
import random
import string
from bs4 import BeautifulSoup
import asyncio
class InternetArchive(Source):
    """Download source for borrowed ebooks from https://archive.org"""

    name: str = "Internet Archive"
    match: list[str] = [
        r"https://archive.org/details/.+"
    ]
    _authentication_methods = [ "login", "cookies" ]
    _login_credentials = [ "username", "password" ]

    @staticmethod
    def _format_data(boundary: str, fields) -> str:
        """
        Manually encode form fields as a multipart/form-data request body.

        :param boundary: Multipart boundary string (without the leading "--")
        :param fields: Mapping of form field names to values
        :returns: Encoded request body
        """
        data = ""
        for name, value in fields.items():
            data += f"--{boundary}\x0d\x0aContent-Disposition: form-data; name=\"{name}\"\x0d\x0a\x0d\x0a{value}\x0d\x0a"
        # The closing delimiter also requires the leading "--" (RFC 2046 §5.1.1);
        # it was previously emitted as `boundary + "--"` without them.
        data += f"--{boundary}--"
        return data

    async def login(self, username: str, password: str, **kwargs) -> None:
        """
        Login to archive.org with username and password.

        Exits the process when the login is rejected.

        :param username: Account username
        :param password: Account password
        """
        # Initial GET picks up the session cookies the login endpoint expects
        await self._client.get("https://archive.org/account/login")
        boundary = "----WebKitFormBoundary" + "".join(
            random.choices(string.ascii_letters + string.digits, k=16)
        )
        headers = {"Content-Type": f"multipart/form-data; boundary={boundary}"}
        data = self._format_data(
            boundary,
            {"username": username, "password": password, "submit_by_js": "true"}
        )
        response = await self._client.post(
            "https://archive.org/account/login",
            data = data,
            headers = headers
        )
        if "Successful login" not in response.text:
            # TODO: raise a grawlix exception instead of exiting directly
            print("Failed login")
            exit(1)

    async def _download_acsm(self, book_id: str) -> bytes:
        """
        Loan book on archive.org and download its acsm file

        :param book_id: Id of book
        :returns: Raw content of the downloaded acsm file
        """
        # Request access to the book before creating a loan
        await self._client.post(
            "https://archive.org/services/loans/loan/searchInside.php",
            data = {
                "action": "grant_access",
                "identifier": book_id
            }
        )
        await self._client.post(
            "https://archive.org/services/loans/loan/",
            data = {
                "action": "browse_book",
                "identifier": book_id
            }
        )
        # TODO: Error handling
        await self._client.post(
            "https://archive.org/services/loans/loan/",
            data = {
                "action": "create_token",
                "identifier": book_id
            }
        )
        acsm_response = await self._client.get(
            f"https://archive.org/services/loans/loan/?action=media_url&identifier={book_id}&format=pdf&redirect=1",
            follow_redirects = True
        )
        return acsm_response.content

    async def download(self, url: str) -> Book:
        """
        Download a book from its archive.org details page url

        :param url: Url on the form https://archive.org/details/<book_id>
        :returns: Downloaded book as an acsm `OfflineFile`
        """
        # Url matches r"https://archive.org/details/.+", so index 4 is the book id
        book_id = url.split("/")[4]
        # Metadata and acsm file are independent requests; fetch them concurrently
        metadata, acsm_file = await asyncio.gather(
            self._download_metadata(book_id),
            self._download_acsm(book_id)
        )
        return Book(
            data = SingleFile(
                OfflineFile(
                    content = acsm_file,
                    extension = "acsm",
                )
            ),
            metadata = Metadata(
                title = metadata["title"],
                authors = [ metadata["creator"] ] if "creator" in metadata else []
            )
        )

    async def _download_metadata(self, book_id: str) -> dict:
        """
        Download metadata for book

        :param book_id: Id of book
        :returns: Dictionary with metadata
        """
        page_response = await self._client.get(
            f"https://archive.org/details/{book_id}"
        )
        soup = BeautifulSoup(page_response.text, "lxml")
        # NOTE(review): crashes with AttributeError if the page has no
        # <ia-book-theater> element — TODO handle missing element explicitly
        metadata_url = soup.find("ia-book-theater").get("bookmanifesturl")
        metadata_response = await self._client.get(
            f"https:{metadata_url}"
        )
        return metadata_response.json()["data"]["metadata"]