Compare commits
9 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 77ba5ba2b6 | |||
| a1c603edcf | |||
| 704b95823b | |||
| 666628dfc7 | |||
| 79c5a7abe5 | |||
| edb1af72eb | |||
| 9440fcf740 | |||
| f5eb46e8a8 | |||
| f815b730f0 |
+1
-3
@@ -1,3 +1 @@
|
|||||||
node_modules
|
.env
|
||||||
download
|
|
||||||
chromium_data
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
|
|
||||||
### example usage
|
|
||||||
```sh
|
|
||||||
node pixiv-downloader.js "初音ミク" -p 3
|
|
||||||
```
|
|
||||||
|
|
||||||
A Chromium window will appear; sign in to your pixiv premium account there if you haven't already.
|
|
||||||
|
|
||||||
### args
|
|
||||||
- `-p <num>` for number of pages (default 1)
|
|
||||||
- `-s <num>` for page number to start on (default 1)
|
|
||||||
- `-r` to include R18
|
|
||||||
Generated
-2631
File diff suppressed because it is too large
Load Diff
@@ -1 +0,0 @@
|
|||||||
{"dependencies":{"download":"^8.0.0","minimist":"^1.2.5","puppeteer":"^10.2.0"}}
|
|
||||||
@@ -1,65 +0,0 @@
|
|||||||
// node pixiv-downloader.js <tags> -p [numpages] -s [startpage] [-r]
//
// Searches pixiv for <tags> ordered by popularity and saves the original
// image files of every result into download/<tags>/.
const argv = require("minimist")(process.argv.slice(2), {
    default: {
        "p": 1, // number of search result pages to fetch
        "s": 1  // search result page to start on
    }
});
// every positional argument is part of the search query
const query = argv._.join(" ");
const puppeteer = require("puppeteer");
const download = require("download");
const fs = require("fs");

/**
 * Navigate `page` to `url` and parse the text of the document body as JSON.
 *
 * @param {object} page - puppeteer page used for the request (so the
 *     browser profile's login session applies).
 * @param {string} url - pixiv ajax endpoint to load.
 * @returns {Promise<object>} the parsed response.
 * @throws {SyntaxError} if the body is not valid JSON.
 */
async function getJson(page, url) {
    await page.goto(url);
    return JSON.parse(await page.evaluate(() => document.querySelector("body").innerText));
}

(async function(){
    const browser = await puppeteer.launch({
        headless: false,
        // keep the browser profile between runs so the login persists
        userDataDir: process.cwd() + "/chromium_data"
    });

    try {
        // Give the user a chance to log in; closing this page resumes the script.
        const loginPage = await browser.newPage();
        await loginPage.setContent(`login to <a target="_blank" href="https://accounts.pixiv.net/login">pixiv</a> if necessary, then close this page to continue.`);
        await new Promise(r => loginPage.on("close", r));

        const page = await browser.newPage();
        const filedir = `download/${query}/`;

        for (let i = argv.s; i < argv.s + argv.p; i++) {
            const searchUrl = `https://www.pixiv.net/ajax/search/artworks/${encodeURIComponent(query)}?order=popular_d&mode=${argv.r ? "all" : "safe"}&p=${i}`;
            console.log("get page", i, searchUrl);
            const searchData = await getJson(page, searchUrl);
            // a failed search is fatal: throw a real Error so a stack trace is available
            if (searchData.error) throw new Error(searchData.message);

            for (const item of searchData.body.illustManga.data) {
                const pagesUrl = `https://www.pixiv.net/ajax/illust/${item.id}/pages`;
                console.log("get", pagesUrl);
                const pagesData = await getJson(page, pagesUrl);
                if (pagesData.error) {
                    // a single broken artwork is not fatal; report it and move on
                    console.error(pagesData.message);
                    continue;
                }

                for (const image of pagesData.body) {
                    const imageUrl = image.urls.original;
                    console.log("download", imageUrl);
                    const filename = imageUrl.split('/').pop();
                    const filepath = filedir + filename;
                    if (fs.existsSync(filepath)) {
                        console.log(filename, "already exists");
                        continue;
                    }
                    // creates both download/ and download/<query>/ when missing
                    fs.mkdirSync(filedir, { recursive: true });
                    try {
                        // Fetch fully before touching the disk so a failed download
                        // never leaves an empty file that a later run would skip.
                        const contents = await download(imageUrl, {
                            headers: {
                                "Referer": "https://www.pixiv.net"
                            }
                        });
                        fs.writeFileSync(filepath, contents);
                        console.log("saved", filename);
                    } catch (error) {
                        console.error("failed to download", imageUrl, error.message);
                    }
                }
            }
        }
        console.log("complete");
    } finally {
        // always release the Chromium process, even after a fatal error
        await browser.close();
    }
})().catch(error => {
    console.error(error);
    process.exitCode = 1;
});
|
|
||||||
|
|
||||||
@@ -0,0 +1,169 @@
|
|||||||
|
import argparse
|
||||||
|
import requests
|
||||||
|
from requests_toolbelt.adapters import host_header_ssl
|
||||||
|
from urllib3.util import Retry
|
||||||
|
from urllib.parse import quote as encodeURI
|
||||||
|
import os
|
||||||
|
from pymongo import MongoClient
|
||||||
|
from gridfs import GridFS
|
||||||
|
from datetime import datetime
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
# ---- command line ---------------------------------------------------------
ap = argparse.ArgumentParser()
ap.add_argument(
    "tag",
    help="Pixiv tag(s) to search",
)
ap.add_argument(
    "-p",
    dest="numpages",
    type=int,
    default=1,
    help="number of pages to download (default 1)",
)
ap.add_argument(
    "-s",
    dest="startpagenum",
    type=int,
    default=1,
    help="page number to start at",
)
args = ap.parse_args()


# ---- HTTP session ---------------------------------------------------------
# Every https request goes through an adapter that honours an explicit Host
# header and retries up to 5 times with backoff.
retry_policy = Retry(total=5, backoff_factor=1)
https_adapter = host_header_ssl.HostHeaderSSLAdapter(max_retries=retry_policy)
rqs = requests.Session()
rqs.mount('https://', https_adapter)


# ---- database -------------------------------------------------------------
# The connection string comes from the environment (populated by load_dotenv).
mongodb_uri = os.environ["MONGODB_URI"]
dbclient = MongoClient(mongodb_uri)

db = dbclient["mikudb"]
search_collection = db["search_collection"]              # one document per search session
illustration_collection = db["illustration_collection"]  # one document per saved artwork

# binary file storage (images, ugoira archives) inside the same database
gridfs = GridFS(db)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def download_popular(tag, startpagenum = 1, numpages = 1):
    """Save the most popular pixiv artworks for ``tag`` into MongoDB / GridFS.

    Walks ``numpages`` pages of popularity-ordered search results beginning at
    page ``startpagenum``. For every artwork not already present in
    ``illustration_collection`` it stores the full ajax metadata, the pages
    listing, every original image (in GridFS) and, for animated works, the
    ugoira archive. Progress is tracked in one ``search_collection`` document.

    Parameters:
        tag: the tag (search query) to look up.
        startpagenum: first search result page to fetch.
        numpages: how many search result pages to fetch.

    Returns:
        None. Returns early when the search runs out of results.

    Raises:
        SystemExit: with status 1 when the search api reports an error.
        KeyError: if the ``PHPSESSID`` environment variable is not set.
    """

    def get_json(url):
        # The server is addressed by IP, so the real host name is supplied in
        # the Host header; the session cookie authenticates the request.
        return rqs.get(url, cookies={"PHPSESSID": os.environ["PHPSESSID"]}, headers={"host":"www.pixiv.net"}).json()

    def mark_completed():
        search_collection.update_one({"_id": search_document_id}, {"$set":{"completed": True}})

    # record this search session in the database
    search_document_id = search_collection.insert_one({
        "date": datetime.now(), # date started
        "query": tag, # the tag being searched
        "current_page": startpagenum, # keep track of the page we're on
        "current_illust": None, # keep track of which item is being downloaded
        "search_data": [], # save each payload
        "results": [], # collect ids of all results that were saved
        "completed": False # whether this ever reached the end
    }).inserted_id

    the_id_of_the_first_result_on_the_previous_page = None
    # numpages is a count, so the last page fetched is startpagenum + numpages - 1
    for page_number in range(startpagenum, startpagenum + numpages):
        # download search results
        search_url = f"https://210.140.131.219/ajax/search/artworks/{encodeURI(tag, safe='')}?order=popular_d&mode=all&p={page_number}"
        print("get", search_url)
        search_data = get_json(search_url)

        # save raw search data
        search_collection.update_one({"_id": search_document_id}, {"$set": {"current_page": page_number}, "$push": {"search_data": {
            "page_number": page_number,
            "search_url": search_url,
            "search_data": search_data
        }}})

        if search_data['error']:
            print("error from search api:", search_data['message'])
            raise SystemExit(1)

        search_results = search_data['body']['illustManga']['data']

        # if there is no data then we are done
        if not search_results:
            print("No more search results")
            mark_completed()
            return

        # But large tags seem to give the last page of results for any page number
        if search_results[0]['id'] == the_id_of_the_first_result_on_the_previous_page:
            print("Reached duplicate search results, looks like the end")
            mark_completed()
            return
        the_id_of_the_first_result_on_the_previous_page = search_results[0]['id']

        # for each search result
        for illust_data_from_search in search_results:
            illust_id = illust_data_from_search['id']

            search_collection.update_one({"_id": search_document_id}, {"$set":{"current_illust": illust_id}})

            # check if this illust was already saved
            if illustration_collection.count_documents({"_id": illust_id}):
                print("already have", illust_id)
                continue

            # the search result has limited information (blank descriptions)
            # download full meta data from ajax
            illust_ajax_url = f"https://210.140.131.219/ajax/illust/{illust_id}"
            print("get", illust_ajax_url)
            illust_ajax_data = get_json(illust_ajax_url)
            if illust_ajax_data['error']:
                print("error from ajax api:", illust_ajax_data['message'])
                # without metadata there is nothing to inspect or save; skip this artwork
                continue

            # save animated works
            ugoira_data = None
            if illust_ajax_data['body']['illustType'] == 2:
                illust_ugoira_url = illust_ajax_url + "/ugoira_meta"
                print("get", illust_ugoira_url)
                illust_ugoira_data = get_json(illust_ugoira_url)
                if illust_ugoira_data['error']:
                    print("error from ajax ugoira api:", illust_ugoira_data['message'])
                else:
                    original_ugoira_url = illust_ugoira_data['body']['originalSrc']
                    print("get", original_ugoira_url)
                    res = rqs.get(original_ugoira_url, headers={'referer':'https://www.pixiv.net'})
                    print("gridfs put", original_ugoira_url)
                    ugoira_data = {
                        "gridfs_id": gridfs.put(res.content, filename=original_ugoira_url.split('/').pop(), original_url=original_ugoira_url, ugoira=True),
                        "ugoira_meta": illust_ugoira_data
                    }

            # illust_ajax_data does not have "page" data (additional image urls)
            # download that
            illust_pages_url = illust_ajax_url + "/pages"
            print("get", illust_pages_url)
            illust_pages_data = get_json(illust_pages_url)
            if illust_pages_data['error']:
                print("error from ajax pages api:", illust_pages_data['message'])
                continue

            # prepare database document
            document = {
                "_id": illust_id, # use the unique artwork id for document id so we can't have duplicates
                "illust_ajax_data": illust_ajax_data, # save all the metadata for the artwork
                "illust_pages_data": illust_pages_data, # save all the image urls of the data
                "downloaded_images": {}, # map of image filenames to gridfs ids
                "ugoira_data": ugoira_data, # animation data
                "date_saved": datetime.now()
            }

            # download originals
            for illust_page_data in illust_pages_data['body']:
                original_image_url = illust_page_data['urls']['original']
                original_image_filename = original_image_url.split('/').pop()

                print("get", original_image_url)
                res = rqs.get(original_image_url, headers={'referer':'https://www.pixiv.net'})

                print("gridfs put", res.url)
                gridfs_id = gridfs.put(res.content, filename=original_image_filename, original_url=original_image_url)
                document['downloaded_images'][original_image_filename] = gridfs_id

            # add to db
            illustration_collection.insert_one(document)
            search_collection.update_one({"_id": search_document_id}, {"$push": {"results": illust_id}})

    mark_completed()
    print("end of loop")


download_popular(args.tag, args.startpagenum, args.numpages)
|
||||||
|
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
requests==2.26.0
|
||||||
|
requests-toolbelt==0.9.1
|
||||||
|
urllib3==1.26.6
|
||||||
|
pymongo==3.12.0
|
||||||
|
python-dotenv==0.19.0
|
||||||
Reference in New Issue
Block a user