Merge branch 'master' of github.com:jo1gi/grawlix

This commit is contained in:
Joakim Holm 2023-05-08 12:46:31 +02:00
commit 4c32ac847f
12 changed files with 236 additions and 64 deletions

View File

@ -3,12 +3,14 @@
![GitHub top language](https://img.shields.io/github/languages/top/jo1gi/grawlix)
![License](https://img.shields.io/github/license/jo1gi/grawlix)
[![Donate using Ko-Fi](https://img.shields.io/badge/donate-kofi-00b9fe?logo=ko-fi&logoColor=00b9fe)](https://ko-fi.com/jo1gi)
CLI ebook downloader
## Supported services
grawlix currently supports downloading from the following sources:
- [eReolen](https://ereolen.dk)
- [Flipp](https://flipp.dk)
- [Internet Archive](https://archive.org)
- [Manga Plus](https://mangaplus.shueisha.co.jp)
- [Royal Road](https://www.royalroad.com)
- [Saxo](https://saxo.com)

View File

@ -20,7 +20,7 @@ def get_login(source: Source, config: Config, options) -> Tuple[str, str, Option
:param options: Command line options
:returns: Login credentials
"""
source_name = source.name.lower()
source_name = source.name.lower().replace(" ", "")
if source_name in config.sources:
username = config.sources[source_name].username or options.username
password = config.sources[source_name].password or options.password

View File

@ -31,11 +31,20 @@ class OnlineFile:
encryption: Optional[Encryption] = None
headers: Optional[dict[str, str]] = None
@dataclass(slots=True)
class OfflineFile:
    """A file whose content is already held in memory"""
    # Raw bytes of the file
    content: bytes
    # File extension used for the written file
    extension: str
    # Encryption the content is protected with; `None` means not encrypted
    encryption: Optional[Encryption] = None
File = Union[OnlineFile, OfflineFile]
@dataclass(slots=True)
class SingleFile:
"""Bookdata in the form of a single file"""
file: OnlineFile
file: File
@dataclass(slots=True)

View File

@ -33,7 +33,7 @@ def load_config() -> Config:
else:
config_dict = {}
sources = {}
if "source" in config_dict:
if "sources" in config_dict:
for key, values in config_dict["sources"].items():
sources[key] = SourceConfig (
username = values.get("username"),

View File

@ -1,8 +1,9 @@
from grawlix.book import Book, BookData, SingleFile, ImageList, OnlineFile, HtmlFiles
from grawlix.exceptions import GrawlixError
from grawlix.exceptions import GrawlixError, UnsupportedOutputFormat
from grawlix.logging import info
from .output_format import OutputFormat
from .acsm import Acsm
from .cbz import Cbz
from .epub import Epub
@ -16,7 +17,12 @@ async def download_book(book: Book, update_func: Callable, template: str) -> Non
:param book: Book to download
"""
output_format = get_default_format(book.data)
_, ext = os.path.splitext(template)
ext = ext[1:]
if ext in get_valid_extensions():
output_format = find_output_format(book, ext)()
else:
output_format = get_default_format(book)
location = format_output_location(book, output_format, template)
if not book.overwrite and os.path.exists(location):
info("Skipping - File already exists")
@ -24,14 +30,7 @@ async def download_book(book: Book, update_func: Callable, template: str) -> Non
parent = Path(location).parent
if not parent.exists():
os.makedirs(parent)
if isinstance(book.data, SingleFile):
await output_format.dl_single_file(book, location, update_func)
elif isinstance(book.data, ImageList):
await output_format.dl_image_list(book, location, update_func)
elif isinstance(book.data, HtmlFiles):
await output_format.dl_html_files(book, location, update_func)
else:
raise NotImplementedError
await output_format.download(book, location, update_func)
await output_format.close()
@ -48,34 +47,43 @@ def format_output_location(book: Book, output_format: OutputFormat, template: st
return template.format(**values, ext = output_format.extension)
def get_default_format(bookdata: BookData) -> OutputFormat:
def get_default_format(book: Book) -> OutputFormat:
"""
Get default output format for bookdata.
Should only be used if no format was specified by the user
:param bookdata: Content of book
:param book: Content of book
:returns: OutputFormat object matching the default
"""
bookdata = book.data
if isinstance(bookdata, SingleFile):
return output_format_from_str(bookdata.file.extension)
extension = bookdata.file.extension
if isinstance(bookdata, ImageList):
return Cbz()
extension = "cbz"
if isinstance(bookdata, HtmlFiles):
return Epub()
raise GrawlixError
extension = "epub"
output_format = find_output_format(book, extension)
return output_format()
def output_format_from_str(name: str) -> OutputFormat:
def find_output_format(book: Book, extension: str) -> type[OutputFormat]:
"""
Convert string to outputformat object
Find a compatible output format
:param name: Name of output format
:returns: OutputFormat object
:param book: Book to download
:param extension: Extension of output file
:returns: Compatible OutputFormat type
:raises: UnsupportedOutputFormat if nothing is found
"""
for output_format in get_output_formats():
if output_format.extension == name:
return output_format()
raise GrawlixError
matches_extension = output_format.extension == extension
supports_bookdata = type(book.data) in output_format.input_types
if matches_extension and supports_bookdata:
return output_format
raise UnsupportedOutputFormat
def get_valid_extensions() -> list[str]:
    """
    List the file extensions of all available output formats

    :returns: One extension per class returned by `get_output_formats`
    """
    extensions = []
    for format_class in get_output_formats():
        extensions.append(format_class.extension)
    return extensions
def get_output_formats() -> list[type[OutputFormat]]:
@ -85,6 +93,7 @@ def get_output_formats() -> list[type[OutputFormat]]:
:returns: List of available output format classes
"""
return [
Acsm,
Cbz,
Epub,
]

23
grawlix/output/acsm.py Normal file
View File

@ -0,0 +1,23 @@
from grawlix.book import Book, SingleFile
from .output_format import OutputFormat, Update
import shutil
import subprocess
class Acsm(OutputFormat):
    """Writes `acsm` files to disk and hands them to `knock` if it is installed"""

    extension = "acsm"
    input_types = [SingleFile]

    async def download(self, book: Book, location: str, update_func: Update) -> None:
        """
        Write the acsm file of `book` to `location` and run `knock` on it

        :param book: Book to download
        :param location: Path to where the file is written
        :param update_func: Function to update progress bar
        """
        # The acsm file itself is always written to disk first
        await self._download_single_file(book, location, update_func)

        # TODO: Implement more general solution
        # Decryption is delegated to the external `knock` program
        # https://web.archive.org/web/20221016154220/https://github.com/BentonEdmondson/knock
        if shutil.which("knock") is None:
            # TODO: Print warning
            return

        # Output of knock is captured and discarded; its exit code is not checked
        subprocess.run(
            ["knock", location],
            capture_output = True
        )

View File

@ -9,18 +9,21 @@ class Cbz(OutputFormat):
"""Comic book zip file"""
extension: str = "cbz"
input_types = [ImageList]
async def dl_image_list(self, book: Book, location: str, update: Update) -> None:
async def download(self, book: Book, location: str, update: Update) -> None:
if not isinstance(book.data, ImageList):
raise UnsupportedOutputFormat
semaphore = asyncio.Semaphore(10)
images = book.data.images
image_count = len(images)
with ZipFile(location, mode="w") as zip:
async def download_page(index: int, file: OnlineFile):
content = await self._download_file(file)
zip.writestr(f"Image {index}.{file.extension}", content)
if update:
update(1/image_count)
async with semaphore:
content = await self._download_file(file)
zip.writestr(f"Image {index}.{file.extension}", content)
if update:
update(1/image_count)
tasks = [
asyncio.create_task(download_page(index, file))
for index, file in enumerate(images)

View File

@ -1,4 +1,4 @@
from grawlix.book import HtmlFiles, HtmlFile, OnlineFile, Book
from grawlix.book import HtmlFiles, HtmlFile, OnlineFile, Book, SingleFile, Metadata
from grawlix.exceptions import UnsupportedOutputFormat
from .output_format import OutputFormat, Update
@ -9,14 +9,20 @@ from ebooklib import epub
class Epub(OutputFormat):
extension = "epub"
input_types = [SingleFile, HtmlFiles]
async def dl_html_files(self, book: Book, location: str, update: Update) -> None:
if not isinstance(book.data, HtmlFiles):
async def download(self, book: Book, location: str, update: Update) -> None:
if isinstance(book.data, SingleFile):
await self._download_single_file(book, location, update)
elif isinstance(book.data, HtmlFiles):
await self._download_html_files(book.data, book.metadata, location, update)
else:
raise UnsupportedOutputFormat
html = book.data
async def _download_html_files(self, html: HtmlFiles, metadata: Metadata, location: str, update: Update) -> None:
output = epub.EpubBook()
output.set_title(book.metadata.title)
for author in book.metadata.authors:
output.set_title(metadata.title)
for author in metadata.authors:
output.add_author(author)
file_count = len(html.htmlfiles) + 1 # Html files + cover

View File

@ -1,4 +1,4 @@
from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book
from grawlix.book import Book, SingleFile, OnlineFile, ImageList, HtmlFiles, Book, OfflineFile, BookData
from grawlix.exceptions import UnsupportedOutputFormat
from grawlix.encryption import decrypt
@ -9,7 +9,8 @@ Update = Optional[Callable[[float], None]]
class OutputFormat:
# Extension for output files
extension: str = ""
extension: str
input_types: list[type[BookData]]
def __init__(self) -> None:
self._client = httpx.AsyncClient()
@ -20,7 +21,18 @@ class OutputFormat:
await self._client.aclose()
async def dl_single_file(self, book: Book, location: str, update_func: Update) -> None:
async def download(self, book: Book, location: str, update_func: Update) -> None:
    """
    Download `book` and write it to `location`

    This base implementation supports no kind of book and always raises.

    :param book: Book to download
    :param location: Path to where the file is written
    :param update_func: Function to update progress bar
    :raises UnsupportedOutputFormat: Always
    """
    raise UnsupportedOutputFormat
async def _download_single_file(self, book: Book, location: str, update_func: Update) -> None:
"""
Download and write an `grawlix.SingleFile` to disk
@ -32,29 +44,10 @@ class OutputFormat:
raise UnsupportedOutputFormat
if not book.data.file.extension == self.extension:
raise UnsupportedOutputFormat
await self._download_and_write_file(book.data.file, location, update_func)
async def dl_image_list(self, book: Book, location: str, update_func: Update) -> None:
"""
Download and write an `grawlix.ImageList` to disk
:param book: Book to download
:param location: Path to where the file is written
:raises UnsupportedOutputFormat: If datatype is not supported by format
"""
raise UnsupportedOutputFormat
async def dl_html_files(self, book: Book, location: str, update_func: Update) -> None:
"""
Download and write a `grawlix.HtmlFiles` to disk
:param book: Book to download
:param location: Path to where the file is written
:raises UnsupportedOutputFormat: If datatype is not supported by format
"""
raise UnsupportedOutputFormat
if isinstance(book.data.file, OnlineFile):
await self._download_and_write_file(book.data.file, location, update_func)
elif isinstance(book.data.file, OfflineFile):
self._write_offline_file(book.data.file, location)
async def _download_file(self, file: OnlineFile, update: Update = None) -> bytes:
@ -88,3 +81,17 @@ class OutputFormat:
content = await self._download_file(file, update)
with open(location, "wb") as f:
f.write(content)
def _write_offline_file(self, file: OfflineFile, location: str) -> None:
    """
    Write the content of an `OfflineFile` to disk

    The content is decrypted (if the file is encrypted) before the output
    file is opened, so a failing decryption does not leave an empty file
    behind at `location`.

    :param file: File to write to disk
    :param location: Path to where the file is written
    """
    content = file.content
    if file.encryption:
        content = decrypt(content, file.encryption)
    # Only touch the filesystem once the final bytes are known
    with open(location, "wb") as f:
        f.write(content)

View File

@ -3,6 +3,7 @@ from grawlix.exceptions import NoSourceFound
from .source import Source
from .ereolen import Ereolen
from .flipp import Flipp
from .internet_archive import InternetArchive
from .mangaplus import MangaPlus
from .royal_road import RoyalRoad
from .saxo import Saxo
@ -53,6 +54,7 @@ def get_source_classes() -> list[type[Source]]:
return [
Ereolen,
Flipp,
InternetArchive,
MangaPlus,
RoyalRoad,
Saxo,

View File

@ -2,8 +2,8 @@ from grawlix.book import Result, Book, SingleFile, Metadata, OnlineFile
from grawlix.encryption import AESCTREncryption
from grawlix.exceptions import InvalidUrl, DataNotFound
from grawlix.utils import nearest_string
from .source import Source
from bs4 import BeautifulSoup
import json
import re

View File

@ -0,0 +1,111 @@
from grawlix.book import Book, SingleFile, Metadata, OfflineFile
from .source import Source
import random
import string
from bs4 import BeautifulSoup
import asyncio
class InternetArchive(Source):
    """Source that borrows a book on archive.org and fetches its acsm file"""

    name: str = "Internet Archive"
    match: list[str] = [
        r"https://archive.org/details/.+"
    ]
    _authentication_methods = [ "login", "cookies" ]
    _login_credentials = [ "username", "password" ]


    @staticmethod
    def _format_data(content_type: str, fields):
        """
        Build a `multipart/form-data` request body by hand

        :param content_type: Boundary string announced in the Content-Type header
        :param fields: Mapping from form field name to value
        :returns: Request body as a string
        """
        parts = [
            f"--{content_type}\x0d\x0aContent-Disposition: form-data; name=\"{name}\"\x0d\x0a\x0d\x0a{value}\x0d\x0a"
            for name, value in fields.items()
        ]
        # The closing delimiter is "--" + boundary + "--"; without the leading
        # dashes the last field is never terminated
        parts.append(f"--{content_type}--")
        return "".join(parts)


    async def login(self, username: str, password: str, **kwargs) -> None:
        """
        Log in to archive.org with username and password

        Prints a message and exits the program with status 1 if the response
        does not report a successful login.

        :param username: Username of the archive.org account
        :param password: Password of the archive.org account
        """
        # The login page is requested before posting credentials;
        # presumably to obtain session cookies - TODO confirm
        await self._client.get("https://archive.org/account/login")
        boundary = "----WebKitFormBoundary" + "".join(
            random.sample(string.ascii_letters + string.digits, 16)
        )
        headers = {'Content-Type': 'multipart/form-data; boundary=' + boundary}
        data = self._format_data(
            boundary,
            {"username": username, "password": password, "submit_by_js": "true"}
        )
        response = await self._client.post(
            "https://archive.org/account/login",
            data=data,
            headers=headers
        )
        if "Successful login" not in response.text:
            print("Failed login")
            # Same effect as `exit(1)` without relying on the `site` module
            raise SystemExit(1)


    async def _download_acsm(self, book_id: str) -> bytes:
        """
        Loan book on archive.org and download acsm file

        :param book_id: Id of book
        :returns: Content of the acsm file
        """
        await self._client.post(
            "https://archive.org/services/loans/loan/searchInside.php",
            data = {
                "action": "grant_access",
                "identifier": book_id
            }
        )
        await self._client.post(
            "https://archive.org/services/loans/loan/",
            data = {
                "action": "browse_book",
                "identifier": book_id
            }
        )
        # TODO: Error handling
        # None of the loan responses are inspected
        await self._client.post(
            "https://archive.org/services/loans/loan/",
            data = {
                "action": "create_token",
                "identifier": book_id
            }
        )
        acsm_response = await self._client.get(
            f"https://archive.org/services/loans/loan/?action=media_url&identifier={book_id}&format=pdf&redirect=1",
            follow_redirects = True
        )
        return acsm_response.content


    async def download(self, url: str) -> Book:
        """
        Download the book behind an archive.org details url

        :param url: Url of the form https://archive.org/details/<book id>
        :returns: Book containing the acsm file and its metadata
        """
        # The id is the path segment after "details"; a query string or
        # fragment attached to it is not part of the id
        book_id = url.split("/")[4].split("?")[0].split("#")[0]
        # Metadata lookup and loan are started concurrently
        metadata, acsm_file = await asyncio.gather(
            self._download_metadata(book_id),
            self._download_acsm(book_id)
        )
        # NOTE(review): a list of creators is passed through as-is instead of
        # being nested in another list - confirm the manifest can return one
        if "creator" not in metadata:
            authors = []
        elif isinstance(metadata["creator"], list):
            authors = list(metadata["creator"])
        else:
            authors = [ metadata["creator"] ]
        return Book(
            data = SingleFile(
                OfflineFile(
                    content = acsm_file,
                    extension = "acsm",
                )
            ),
            metadata = Metadata(
                title = metadata["title"],
                authors = authors
            )
        )


    async def _download_metadata(self, book_id: str) -> dict:
        """
        Download metadata for book

        :param book_id: Id of book
        :returns: Dictionary with metadata
        """
        page_response = await self._client.get(
            f"https://archive.org/details/{book_id}"
        )
        soup = BeautifulSoup(page_response.text, "lxml")
        # NOTE(review): fails with AttributeError if the page has no
        # <ia-book-theater> element
        metadata_url = soup.find("ia-book-theater").get("bookmanifesturl")
        # The attribute value is prefixed with "https:", so it is presumably
        # a protocol-relative url - TODO confirm
        metadata_response = await self._client.get(
            f"https:{metadata_url}"
        )
        return metadata_response.json()["data"]["metadata"]