mirror of
https://github.com/jo1gi/grawlix.git
synced 2025-12-16 04:09:10 +00:00
Merge branch 'archiveorg'
This commit is contained in:
commit
73692bfe0f
@ -9,6 +9,7 @@ CLI ebook downloader
|
||||
grawlix currently supports downloading from the following sources:
|
||||
- [eReolen](https://ereolen.dk)
|
||||
- [Flipp](https://flipp.dk)
|
||||
- [Internet Archive](https://archive.org)
|
||||
- [Manga Plus](https://mangaplus.shueisha.co.jp)
|
||||
- [Royal Road](https://www.royalroad.com)
|
||||
- [Saxo](https://saxo.com)
|
||||
|
||||
@ -20,7 +20,7 @@ def get_login(source: Source, config: Config, options) -> Tuple[str, str, Option
|
||||
:param options: Command line options
|
||||
:returns: Login credentials
|
||||
"""
|
||||
source_name = source.name.lower()
|
||||
source_name = source.name.lower().replace(" ", "")
|
||||
if source_name in config.sources:
|
||||
username = config.sources[source_name].username or options.username
|
||||
password = config.sources[source_name].password or options.password
|
||||
|
||||
@ -31,11 +31,20 @@ class OnlineFile:
|
||||
encryption: Optional[Encryption] = None
|
||||
headers: Optional[dict[str, str]] = None
|
||||
|
||||
@dataclass(slots=True)
class OfflineFile:
    """A file whose content is already held in memory"""
    # Raw bytes of the file
    content: bytes
    # Extension of the file, e.g. "acsm"
    extension: str
    # Encryption of `content`; `None` when no encryption is given
    encryption: Optional[Encryption] = None
|
||||
|
||||
File = Union[OnlineFile, OfflineFile]
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
class SingleFile:
|
||||
"""Bookdata in the form of a single file"""
|
||||
file: OnlineFile
|
||||
file: File
|
||||
|
||||
|
||||
@dataclass(slots=True)
|
||||
|
||||
@ -3,6 +3,7 @@ from grawlix.exceptions import GrawlixError
|
||||
from grawlix.logging import info
|
||||
|
||||
from .output_format import OutputFormat
|
||||
from .acsm import Acsm
|
||||
from .cbz import Cbz
|
||||
from .epub import Epub
|
||||
|
||||
@ -85,6 +86,7 @@ def get_output_formats() -> list[type[OutputFormat]]:
|
||||
:returns: List of available output format classes
|
||||
"""
|
||||
return [
|
||||
Acsm,
|
||||
Cbz,
|
||||
Epub,
|
||||
]
|
||||
|
||||
22
grawlix/output/acsm.py
Normal file
22
grawlix/output/acsm.py
Normal file
@ -0,0 +1,22 @@
|
||||
from grawlix.book import Book
|
||||
from .output_format import OutputFormat, Update
|
||||
import shutil
|
||||
import subprocess
|
||||
|
||||
class Acsm(OutputFormat):
    """Output format for `.acsm` files"""

    extension = "acsm"

    async def dl_single_file(self, book: Book, location: str, update_func: Update) -> None:
        """
        Write the acsm file of a book to disk and decrypt it if knock is installed

        :param book: Book to download
        :param location: Path to where the file is written
        :param update_func: Optional progress callback, passed on to the parent implementation
        """
        import asyncio

        # Download and write acsm file to disk
        await super().dl_single_file(book, location, update_func)
        # TODO: Implement more general solution
        # Decrypt if knock is available
        # https://web.archive.org/web/20221016154220/https://github.com/BentonEdmondson/knock
        if shutil.which("knock") is None:
            # TODO: Print warning
            return
        # `subprocess.run` blocks until knock exits; run it in a worker
        # thread so the event loop stays responsive in the meantime.
        # The captured output and the exit status are discarded.
        await asyncio.to_thread(
            subprocess.run,
            ["knock", location],
            capture_output = True
        )
|
||||
@ -1,4 +1,4 @@
|
||||
from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book
|
||||
from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book, OfflineFile
|
||||
from grawlix.exceptions import UnsupportedOutputFormat
|
||||
from grawlix.encryption import decrypt
|
||||
|
||||
@ -9,7 +9,7 @@ Update = Optional[Callable[[float], None]]
|
||||
|
||||
class OutputFormat:
|
||||
# Extension for output files
|
||||
extension: str = ""
|
||||
extension: str
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._client = httpx.AsyncClient()
|
||||
@ -32,7 +32,10 @@ class OutputFormat:
|
||||
raise UnsupportedOutputFormat
|
||||
if not book.data.file.extension == self.extension:
|
||||
raise UnsupportedOutputFormat
|
||||
await self._download_and_write_file(book.data.file, location, update_func)
|
||||
if isinstance(book.data.file, OnlineFile):
|
||||
await self._download_and_write_file(book.data.file, location, update_func)
|
||||
elif isinstance(book.data.file, OfflineFile):
|
||||
self._write_offline_file(book.data.file, location)
|
||||
|
||||
|
||||
async def dl_image_list(self, book: Book, location: str, update_func: Update) -> None:
|
||||
@ -88,3 +91,17 @@ class OutputFormat:
|
||||
content = await self._download_file(file, update)
|
||||
with open(location, "wb") as f:
|
||||
f.write(content)
|
||||
|
||||
|
||||
def _write_offline_file(self, file: OfflineFile, location: str) -> None:
    """
    Write the content of an `OfflineFile` to disk

    The content is passed through `decrypt` first when the file has
    encryption set.

    :param file: File to write to disk
    :param location: Path to where the file is written
    """
    content = file.content
    if file.encryption:
        content = decrypt(content, file.encryption)
    # Open the output only once the final content is ready, so a failing
    # decrypt does not leave an empty file behind
    with open(location, "wb") as f:
        f.write(content)
|
||||
|
||||
@ -3,6 +3,7 @@ from grawlix.exceptions import NoSourceFound
|
||||
from .source import Source
|
||||
from .ereolen import Ereolen
|
||||
from .flipp import Flipp
|
||||
from .internet_archive import InternetArchive
|
||||
from .mangaplus import MangaPlus
|
||||
from .royal_road import RoyalRoad
|
||||
from .saxo import Saxo
|
||||
@ -53,6 +54,7 @@ def get_source_classes() -> list[type[Source]]:
|
||||
return [
|
||||
Ereolen,
|
||||
Flipp,
|
||||
InternetArchive,
|
||||
MangaPlus,
|
||||
RoyalRoad,
|
||||
Saxo,
|
||||
|
||||
@ -2,8 +2,8 @@ from grawlix.book import Result, Book, SingleFile, Metadata, OnlineFile
|
||||
from grawlix.encryption import AESCTREncryption
|
||||
from grawlix.exceptions import InvalidUrl, DataNotFound
|
||||
from grawlix.utils import nearest_string
|
||||
|
||||
from .source import Source
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import json
|
||||
import re
|
||||
|
||||
111
grawlix/sources/internet_archive.py
Normal file
111
grawlix/sources/internet_archive.py
Normal file
@ -0,0 +1,111 @@
|
||||
from grawlix.book import Book, SingleFile, Metadata, OfflineFile
|
||||
from .source import Source
|
||||
|
||||
import random
|
||||
import string
|
||||
from bs4 import BeautifulSoup
|
||||
import asyncio
|
||||
|
||||
class InternetArchive(Source):
    """Source that loans books on archive.org and downloads them as acsm files"""

    name: str = "Internet Archive"
    match: list[str] = [
        r"https://archive.org/details/.+"
    ]
    _authentication_methods = [ "login", "cookies" ]
    _login_credentials = [ "username", "password" ]

    @staticmethod
    def _format_data(content_type: str, fields):
        """
        Encode form fields as a multipart/form-data request body

        :param content_type: Boundary string that separates the parts
        :param fields: Mapping of field names to field values
        :returns: Encoded request body
        """
        parts = [
            f"--{content_type}\x0d\x0aContent-Disposition: form-data; name=\"{name}\"\x0d\x0a\x0d\x0a{value}\x0d\x0a"
            for name, value in fields.items()
        ]
        # The closing delimiter is the boundary with "--" on both sides
        # (RFC 2046); without the leading dashes the body is never terminated
        parts.append(f"--{content_type}--\x0d\x0a")
        return "".join(parts)


    async def login(self, username: str, password: str, **kwargs) -> None:
        """
        Log in to archive.org with username and password

        Prints "Failed login" and exits with status 1 if the response does
        not contain "Successful login".

        :param username: Username of the account
        :param password: Password of the account
        :param kwargs: Ignored
        """
        # Visit the login page first; presumably this sets cookies that the
        # login request needs - TODO confirm
        await self._client.get("https://archive.org/account/login")
        boundary = "----WebKitFormBoundary" + "".join(
            random.sample(string.ascii_letters + string.digits, 16)
        )
        headers = {'Content-Type': 'multipart/form-data; boundary=' + boundary}
        body = self._format_data(
            boundary,
            {"username": username, "password": password, "submit_by_js": "true"}
        )
        response = await self._client.post(
            "https://archive.org/account/login",
            # The body is already encoded, so it is sent as raw content;
            # passing a string through `data=` is deprecated in httpx
            content = body,
            headers = headers
        )
        if "Successful login" not in response.text:
            print("Failed login")
            # Same effect as `exit(1)` without relying on the `site` builtin
            raise SystemExit(1)


    async def _download_acsm(self, book_id: str) -> bytes:
        """
        Loan book on archive.org and download acsm file

        :param book_id: Id of book
        :returns: Content of the acsm file
        """
        await self._client.post(
            "https://archive.org/services/loans/loan/searchInside.php",
            data = {
                "action": "grant_access",
                "identifier": book_id
            }
        )
        await self._client.post(
            "https://archive.org/services/loans/loan/",
            data = {
                "action": "browse_book",
                "identifier": book_id
            }
        )
        # TODO: Error handling
        await self._client.post(
            "https://archive.org/services/loans/loan/",
            data = {
                "action": "create_token",
                "identifier": book_id
            }
        )
        acsm_response = await self._client.get(
            f"https://archive.org/services/loans/loan/?action=media_url&identifier={book_id}&format=pdf&redirect=1",
            follow_redirects = True
        )
        return acsm_response.content


    async def download(self, url: str) -> Book:
        """
        Download the book at `url` as an acsm file together with its metadata

        :param url: Url of the book (`https://archive.org/details/<id>`)
        :returns: Book with the acsm file as an `OfflineFile`
        """
        from urllib.parse import urlsplit

        # Take the id from the path ("/details/<id>/...") so a query string
        # or fragment after the id does not end up in the identifier
        book_id = urlsplit(url).path.split("/")[2]
        metadata, acsm_file = await asyncio.gather(
            self._download_metadata(book_id),
            self._download_acsm(book_id)
        )
        # "creator" may presumably be a single name or a list of names
        # - TODO confirm against the manifest format
        creator = metadata.get("creator")
        if creator is None:
            authors = []
        elif isinstance(creator, str):
            authors = [ creator ]
        else:
            authors = list(creator)
        return Book(
            data = SingleFile(
                OfflineFile(
                    content = acsm_file,
                    extension = "acsm",
                )
            ),
            metadata = Metadata(
                title = metadata["title"],
                authors = authors
            )
        )


    async def _download_metadata(self, book_id: str) -> dict:
        """
        Download metadata for book

        :param book_id: Id of book
        :returns: Dictionary with metadata
        """
        page_response = await self._client.get(
            f"https://archive.org/details/{book_id}"
        )
        soup = BeautifulSoup(page_response.text, "lxml")
        # NOTE(review): raises AttributeError if the page has no
        # <ia-book-theater> element
        metadata_url = soup.find("ia-book-theater").get("bookmanifesturl")
        # The attribute value is used as a scheme-less url ("//host/path"),
        # so only the scheme is prepended
        metadata_response = await self._client.get(
            f"https:{metadata_url}"
        )
        return metadata_response.json()["data"]["metadata"]
|
||||
Loading…
Reference in New Issue
Block a user