mirror of
https://github.com/jo1gi/grawlix.git
synced 2025-12-16 04:09:10 +00:00
Merge branch 'archiveorg'
This commit is contained in:
commit
73692bfe0f
@ -9,6 +9,7 @@ CLI ebook downloader
|
|||||||
grawlix currently supports downloading from the following sources:
|
grawlix currently supports downloading from the following sources:
|
||||||
- [eReolen](https://ereolen.dk)
|
- [eReolen](https://ereolen.dk)
|
||||||
- [Flipp](https://flipp.dk)
|
- [Flipp](https://flipp.dk)
|
||||||
|
- [Internet Archive](https://archive.org)
|
||||||
- [Manga Plus](https://mangaplus.shueisha.co.jp)
|
- [Manga Plus](https://mangaplus.shueisha.co.jp)
|
||||||
- [Royal Road](https://www.royalroad.com)
|
- [Royal Road](https://www.royalroad.com)
|
||||||
- [Saxo](https://saxo.com)
|
- [Saxo](https://saxo.com)
|
||||||
|
|||||||
@ -20,7 +20,7 @@ def get_login(source: Source, config: Config, options) -> Tuple[str, str, Option
|
|||||||
:param options: Command line options
|
:param options: Command line options
|
||||||
:returns: Login credentials
|
:returns: Login credentials
|
||||||
"""
|
"""
|
||||||
source_name = source.name.lower()
|
source_name = source.name.lower().replace(" ", "")
|
||||||
if source_name in config.sources:
|
if source_name in config.sources:
|
||||||
username = config.sources[source_name].username or options.username
|
username = config.sources[source_name].username or options.username
|
||||||
password = config.sources[source_name].password or options.password
|
password = config.sources[source_name].password or options.password
|
||||||
|
|||||||
@ -31,11 +31,20 @@ class OnlineFile:
|
|||||||
encryption: Optional[Encryption] = None
|
encryption: Optional[Encryption] = None
|
||||||
headers: Optional[dict[str, str]] = None
|
headers: Optional[dict[str, str]] = None
|
||||||
|
|
||||||
|
@dataclass(slots=True)
class OfflineFile:
    """Stores content of a file"""

    # Raw file content, already held in memory (as opposed to OnlineFile,
    # which still has to be downloaded)
    content: bytes
    # File extension without the leading dot (e.g. "acsm")
    extension: str
    # Encryption scheme the content must be decrypted with before use;
    # None means the content is stored in the clear
    encryption: Optional[Encryption] = None


# A book file is either still remote (OnlineFile) or already in memory (OfflineFile)
File = Union[OnlineFile, OfflineFile]
|
||||||
|
|
||||||
|
|
||||||
@dataclass(slots=True)
class SingleFile:
    """Bookdata in the form of a single file"""

    # The file holding the whole book; may be remote (OnlineFile) or
    # already downloaded (OfflineFile)
    file: File
|
||||||
|
|
||||||
|
|
||||||
@dataclass(slots=True)
|
@dataclass(slots=True)
|
||||||
|
|||||||
@ -3,6 +3,7 @@ from grawlix.exceptions import GrawlixError
|
|||||||
from grawlix.logging import info
|
from grawlix.logging import info
|
||||||
|
|
||||||
from .output_format import OutputFormat
|
from .output_format import OutputFormat
|
||||||
|
from .acsm import Acsm
|
||||||
from .cbz import Cbz
|
from .cbz import Cbz
|
||||||
from .epub import Epub
|
from .epub import Epub
|
||||||
|
|
||||||
@ -85,6 +86,7 @@ def get_output_formats() -> list[type[OutputFormat]]:
|
|||||||
:returns: List of available output format classes
|
:returns: List of available output format classes
|
||||||
"""
|
"""
|
||||||
return [
|
return [
|
||||||
|
Acsm,
|
||||||
Cbz,
|
Cbz,
|
||||||
Epub,
|
Epub,
|
||||||
]
|
]
|
||||||
|
|||||||
22
grawlix/output/acsm.py
Normal file
22
grawlix/output/acsm.py
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
from grawlix.book import Book
|
||||||
|
from .output_format import OutputFormat, Update
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
class Acsm(OutputFormat):
    """Output format for Adobe Content Server Message (`.acsm`) files."""

    extension = "acsm"

    async def dl_single_file(self, book: Book, location: str, update_func: Update) -> None:
        """
        Download the acsm file to `location` and, when possible, decrypt it.

        :param book: Book to download
        :param location: Path the file is written to
        :param update_func: Progress callback forwarded to the base download
        """
        # Fetch the acsm file and write it to disk first
        await super().dl_single_file(book, location, update_func)
        # TODO: Implement more general solution
        # Decrypt with knock when the tool is installed
        # https://web.archive.org/web/20221016154220/https://github.com/BentonEdmondson/knock
        if shutil.which("knock") is None:
            # TODO: Print warning
            return
        subprocess.run(
            ["knock", location],
            capture_output = True
        )
||||||
@ -1,4 +1,4 @@
|
|||||||
from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book
|
from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book, OfflineFile
|
||||||
from grawlix.exceptions import UnsupportedOutputFormat
|
from grawlix.exceptions import UnsupportedOutputFormat
|
||||||
from grawlix.encryption import decrypt
|
from grawlix.encryption import decrypt
|
||||||
|
|
||||||
@ -9,7 +9,7 @@ Update = Optional[Callable[[float], None]]
|
|||||||
|
|
||||||
class OutputFormat:
|
class OutputFormat:
|
||||||
# Extension for output files
|
# Extension for output files
|
||||||
extension: str = ""
|
extension: str
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self._client = httpx.AsyncClient()
|
self._client = httpx.AsyncClient()
|
||||||
@ -32,7 +32,10 @@ class OutputFormat:
|
|||||||
raise UnsupportedOutputFormat
|
raise UnsupportedOutputFormat
|
||||||
if not book.data.file.extension == self.extension:
|
if not book.data.file.extension == self.extension:
|
||||||
raise UnsupportedOutputFormat
|
raise UnsupportedOutputFormat
|
||||||
await self._download_and_write_file(book.data.file, location, update_func)
|
if isinstance(book.data.file, OnlineFile):
|
||||||
|
await self._download_and_write_file(book.data.file, location, update_func)
|
||||||
|
elif isinstance(book.data.file, OfflineFile):
|
||||||
|
self._write_offline_file(book.data.file, location)
|
||||||
|
|
||||||
|
|
||||||
async def dl_image_list(self, book: Book, location: str, update_func: Update) -> None:
|
async def dl_image_list(self, book: Book, location: str, update_func: Update) -> None:
|
||||||
@ -88,3 +91,17 @@ class OutputFormat:
|
|||||||
content = await self._download_file(file, update)
|
content = await self._download_file(file, update)
|
||||||
with open(location, "wb") as f:
|
with open(location, "wb") as f:
|
||||||
f.write(content)
|
f.write(content)
|
||||||
|
|
||||||
|
|
||||||
|
def _write_offline_file(self, file: OfflineFile, location: str) -> None:
    """
    Write the content of an `OfflineFile` to disk

    Decrypts the content first when the file carries an encryption scheme.

    :param file: File to write to disk
    :param location: Path to where the file is written
    """
    content = file.content
    # Decrypt before touching the destination: opening with "wb" truncates,
    # so a decryption failure must not leave behind an empty/corrupt file
    if file.encryption:
        content = decrypt(content, file.encryption)
    with open(location, "wb") as f:
        f.write(content)
||||||
|
|||||||
@ -3,6 +3,7 @@ from grawlix.exceptions import NoSourceFound
|
|||||||
from .source import Source
|
from .source import Source
|
||||||
from .ereolen import Ereolen
|
from .ereolen import Ereolen
|
||||||
from .flipp import Flipp
|
from .flipp import Flipp
|
||||||
|
from .internet_archive import InternetArchive
|
||||||
from .mangaplus import MangaPlus
|
from .mangaplus import MangaPlus
|
||||||
from .royal_road import RoyalRoad
|
from .royal_road import RoyalRoad
|
||||||
from .saxo import Saxo
|
from .saxo import Saxo
|
||||||
@ -53,6 +54,7 @@ def get_source_classes() -> list[type[Source]]:
|
|||||||
return [
|
return [
|
||||||
Ereolen,
|
Ereolen,
|
||||||
Flipp,
|
Flipp,
|
||||||
|
InternetArchive,
|
||||||
MangaPlus,
|
MangaPlus,
|
||||||
RoyalRoad,
|
RoyalRoad,
|
||||||
Saxo,
|
Saxo,
|
||||||
|
|||||||
@ -2,8 +2,8 @@ from grawlix.book import Result, Book, SingleFile, Metadata, OnlineFile
|
|||||||
from grawlix.encryption import AESCTREncryption
|
from grawlix.encryption import AESCTREncryption
|
||||||
from grawlix.exceptions import InvalidUrl, DataNotFound
|
from grawlix.exceptions import InvalidUrl, DataNotFound
|
||||||
from grawlix.utils import nearest_string
|
from grawlix.utils import nearest_string
|
||||||
|
|
||||||
from .source import Source
|
from .source import Source
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
|||||||
111
grawlix/sources/internet_archive.py
Normal file
111
grawlix/sources/internet_archive.py
Normal file
@ -0,0 +1,111 @@
|
|||||||
|
from grawlix.book import Book, SingleFile, Metadata, OfflineFile
|
||||||
|
from .source import Source
|
||||||
|
|
||||||
|
import random
|
||||||
|
import string
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
class InternetArchive(Source):
    """Download loaned books from https://archive.org"""

    name: str = "Internet Archive"
    match: list[str] = [
        r"https://archive.org/details/.+"
    ]
    _authentication_methods = [ "login", "cookies" ]
    _login_credentials = [ "username", "password" ]


    @staticmethod
    def _format_data(boundary: str, fields) -> str:
        """
        Encode form fields as a multipart/form-data request body.

        :param boundary: Multipart boundary string (without the leading dashes)
        :param fields: Mapping of form field names to their values
        :returns: Encoded multipart body
        """
        data = ""
        for name, value in fields.items():
            data += f"--{boundary}\x0d\x0aContent-Disposition: form-data; name=\"{name}\"\x0d\x0a\x0d\x0a{value}\x0d\x0a"
        # Fix: the close delimiter is "--boundary--" (RFC 2046 §5.1.1); the
        # previous version omitted the leading dashes, producing an invalid body
        data += f"--{boundary}--"
        return data


    async def login(self, username: str, password: str, **kwargs) -> None:
        """
        Authenticate against archive.org with username and password.

        :param username: archive.org account name
        :param password: archive.org account password
        """
        # Initial GET establishes the session cookies the login endpoint expects
        await self._client.get("https://archive.org/account/login")
        # Mimic a browser-generated WebKit form boundary
        boundary = "----WebKitFormBoundary" + "".join(random.sample(string.ascii_letters + string.digits, 16))
        headers = {'Content-Type': 'multipart/form-data; boundary='+boundary}
        data = self._format_data(boundary, {"username":username, "password":password, "submit_by_js":"true"})
        response = await self._client.post(
            "https://archive.org/account/login",
            data=data,
            headers=headers
        )
        if not "Successful login" in response.text:
            # TODO: raise a grawlix exception instead of exiting the process
            print("Failed login")
            exit(1)


    async def _download_acsm(self, book_id: str) -> bytes:
        """
        Loan book on archive.org and download acsm file

        :param book_id: Id of book
        :returns: Raw content of the acsm file
        """
        # Grant search-inside access for the title
        await self._client.post(
            "https://archive.org/services/loans/loan/searchInside.php",
            data = {
                "action": "grant_access",
                "identifier": book_id
            }
        )
        # Register a browsing loan for the book
        await self._client.post(
            "https://archive.org/services/loans/loan/",
            data = {
                "action": "browse_book",
                "identifier": book_id
            }
        )
        # TODO: Error handling
        # Create the loan token consumed by the media_url endpoint below
        await self._client.post(
            "https://archive.org/services/loans/loan/",
            data = {
                "action": "create_token",
                "identifier": book_id
            }
        )
        acsm_response = await self._client.get(
            f"https://archive.org/services/loans/loan/?action=media_url&identifier={book_id}&format=pdf&redirect=1",
            follow_redirects = True
        )
        return acsm_response.content


    async def download(self, url: str) -> Book:
        """
        Download book from an archive.org details url.

        :param url: Url of book on archive.org
        :returns: Book holding the acsm file content and metadata
        """
        # Url shape: https://archive.org/details/<book_id>[/...]
        book_id = url.split("/")[4]
        # Metadata and loan/acsm download are independent; run them concurrently
        metadata, acsm_file = await asyncio.gather(
            self._download_metadata(book_id),
            self._download_acsm(book_id)
        )
        return Book(
            data = SingleFile(
                OfflineFile(
                    content = acsm_file,
                    extension = "acsm",
                )
            ),
            metadata = Metadata(
                title = metadata["title"],
                authors = [ metadata.get("creator") ] if "creator" in metadata else []
            )
        )


    async def _download_metadata(self, book_id: str) -> dict:
        """
        Download metadata for book

        :param book_id: Id of book
        :returns: Dictionary with metadata
        """
        page_response = await self._client.get(
            f"https://archive.org/details/{book_id}"
        )
        soup = BeautifulSoup(page_response.text, "lxml")
        # The book theater element carries a protocol-relative manifest url
        metadata_url = soup.find("ia-book-theater").get("bookmanifesturl")
        metadata_response = await self._client.get(
            f"https:{metadata_url}"
        )
        return metadata_response.json()["data"]["metadata"]
Loading…
Reference in New Issue
Block a user