9 Commits

Author SHA1 Message Date
lamp 77ba5ba2b6 hmm retry 2021-09-23 20:26:06 -07:00
lamp a1c603edcf save ugoira 2021-09-23 19:13:02 -07:00
lamp 704b95823b gridfs meta 2021-09-23 18:03:55 -07:00
lamp 666628dfc7 v2 mongodb version 2021-09-23 17:39:32 -07:00
lamp 79c5a7abe5 Merge branch 'master' of gitea.moe:lamp/pixiv-popular-downloader 2021-09-21 13:12:43 -07:00
lamp edb1af72eb eliminate the need to edit hosts file 2021-09-21 13:10:56 -07:00
lamp 9440fcf740 Update 'README.md' 2021-09-21 06:09:21 -05:00
lamp f5eb46e8a8 refactored on python 2021-09-21 04:04:51 -07:00
lamp f815b730f0 resurrect python script 2021-09-21 03:38:29 -07:00
8 changed files with 175 additions and 2747 deletions
+1 -3
View File
@@ -1,3 +1 @@
node_modules .env
download
chromium_data
-12
View File
@@ -1,12 +0,0 @@
### example usage
```sh
node pixiv-downloader.js "初音ミク" -p 3
```
A Chromium window will appear; sign in to your pixiv premium account there if you haven't already.
### args
- `-p <num>` for number of pages (default 1)
- `-s <num>` for page number to start on (default 1)
- `-r` to include R18
-2631
View File
File diff suppressed because it is too large Load Diff
-1
View File
@@ -1 +0,0 @@
{"dependencies":{"download":"^8.0.0","minimist":"^1.2.5","puppeteer":"^10.2.0"}}
-65
View File
@@ -1,65 +0,0 @@
// node pixiv-downloader.js <tags> -p [numpages] -s [startpage] [-r]
//
// Downloads the original images of pixiv search results ordered by popularity
// (order=popular_d) into download/<tags>/. The ajax endpoints are loaded inside
// a visible Chromium window so the requests use whatever pixiv session is
// stored in ./chromium_data.
const argv = require("minimist")(process.argv.slice(2), {
    default: {
        "p": 1, // number of search result pages to fetch
        "s": 1  // page number to start on
    }
});
const query = argv._.join(" ");
const puppeteer = require("puppeteer");
const download = require("download");
const fs = require("fs");

/**
 * Navigate `page` to a JSON endpoint and parse the text rendered in <body>.
 *
 * @param {object} page - puppeteer page used for navigation
 * @param {string} url - endpoint to load
 * @returns {Promise<object>} the parsed JSON payload
 */
async function fetchJson(page, url) {
    await page.goto(url);
    return JSON.parse(await page.evaluate(() => document.querySelector("body").innerText));
}

/**
 * Download one original image into download/<query>/ unless it already exists.
 *
 * The file is written only after the whole download succeeded, so a failed
 * request never leaves an empty or partial file behind that a later run would
 * skip as "already exists".
 *
 * @param {string} url - original image url
 * @returns {Promise<void>}
 */
async function saveImage(url) {
    console.log("download", url);
    const filename = url.split('/').pop();
    const filedir = "download/" + query + "/";
    const filepath = filedir + filename;
    if (fs.existsSync(filepath)) {
        console.log(filename, "already exists");
        return;
    }
    // recursive: creates "download" and the per-query folder, no-op if present
    fs.mkdirSync(filedir, { recursive: true });
    const buffer = await download(url, {
        headers: {
            "Referer": "https://www.pixiv.net"
        }
    });
    await fs.promises.writeFile(filepath, buffer);
    console.log("saved", filename);
}

(async function () {
    const browser = await puppeteer.launch({
        headless: false,
        userDataDir: process.cwd() + "/chromium_data"
    });
    try {
        // Block until the user closes the login prompt page.
        const loginPage = await browser.newPage();
        await loginPage.setContent(`login to <a target="_blank" href="https://accounts.pixiv.net/login">pixiv</a> if necessary, then close this page to continue.`);
        await new Promise(r => loginPage.on("close", r));

        const page = await browser.newPage();
        for (let i = argv.s; i < argv.s + argv.p; i++) {
            const searchUrl = `https://www.pixiv.net/ajax/search/artworks/${encodeURIComponent(query)}?order=popular_d&mode=${argv.r ? "all" : "safe"}&p=${i}`;
            console.log("get page", i, searchUrl);
            const searchData = await fetchJson(page, searchUrl);
            // A failed search aborts the whole run.
            if (searchData.error) throw new Error(searchData.message);

            for (const item of searchData.body.illustManga.data) {
                const pagesUrl = `https://www.pixiv.net/ajax/illust/${item.id}/pages`;
                console.log("get", pagesUrl);
                const pagesData = await fetchJson(page, pagesUrl);
                if (pagesData.error) {
                    // A single failed artwork is reported and skipped.
                    console.error(pagesData.message);
                    continue;
                }
                for (const image of pagesData.body) {
                    await saveImage(image.urls.original);
                }
            }
        }
        console.log("complete");
    } finally {
        // Close Chromium even when the run fails part-way through.
        await browser.close();
    }
})().catch(err => {
    console.error(err);
    process.exitCode = 1;
});
+169
View File
@@ -0,0 +1,169 @@
import argparse
import requests
from requests_toolbelt.adapters import host_header_ssl
from urllib3.util import Retry
from urllib.parse import quote as encodeURI
import os
from pymongo import MongoClient
from gridfs import GridFS
from datetime import datetime
from dotenv import load_dotenv
# Pull MONGODB_URI / PHPSESSID (read below and in download_popular) from a .env file if present.
load_dotenv()

# Command-line interface.
ap = argparse.ArgumentParser()
ap.add_argument(
    "tag",
    help="Pixiv tag(s) to search",
)
ap.add_argument(
    "-p",
    dest="numpages",
    type=int,
    default=1,
    help="number of pages to download (default 1)",
)
ap.add_argument(
    "-s",
    dest="startpagenum",
    type=int,
    default=1,
    help="page number to start at",
)
args = ap.parse_args()

# Shared HTTP session: every https:// request goes through the host-header
# adapter, retrying up to 5 times with backoff.
retry_policy = Retry(total=5, backoff_factor=1)
https_adapter = host_header_ssl.HostHeaderSSLAdapter(max_retries=retry_policy)
rqs = requests.Session()
rqs.mount('https://', https_adapter)

# MongoDB handles: one collection per saved artwork, one per search session,
# and GridFS on the same database for the downloaded files.
dbclient = MongoClient(os.environ["MONGODB_URI"])
db = dbclient["mikudb"]
illustration_collection = db["illustration_collection"]
search_collection = db["search_collection"]
gridfs = GridFS(db)
def _get_pixiv_json(url):
    """GET a pixiv ajax endpoint with the session cookie and return the decoded JSON."""
    print("get", url)
    # The URLs passed in address the server by IP, so the real hostname is sent
    # in the host header; PHPSESSID carries the logged-in session.
    response = rqs.get(
        url,
        cookies={"PHPSESSID": os.environ["PHPSESSID"]},
        headers={"host": "www.pixiv.net"},
    )
    return response.json()


def _mark_search_completed(search_document_id):
    """Flag the search session document as having reached its end."""
    search_collection.update_one({"_id": search_document_id}, {"$set": {"completed": True}})


def download_popular(tag, startpagenum = 1, numpages = 1):
    """Download popular artworks for ``tag`` into MongoDB / GridFS.

    A document describing this search session is inserted into
    ``search_collection`` and updated as the run progresses. For every search
    result that is not already in ``illustration_collection`` the full
    metadata, the page list, the original images and (for animated works) the
    ugoira archive are fetched and stored.

    Args:
        tag: the tag(s) to search for.
        startpagenum: first search result page to fetch.
        numpages: how many pages to fetch, starting at ``startpagenum``.

    Raises:
        SystemExit: with status 1 when the search API reports an error.
    """
    # record this search session in the database
    search_document_id = search_collection.insert_one({
        "date": datetime.now(),         # date started
        "query": tag,                   # the tag being searched
        "current_page": startpagenum,   # keep track of the page we're on
        "current_illust": None,         # keep track of which item is being downloaded
        "search_data": [],              # save each payload
        "results": [],                  # collect ids of all results that were saved
        "completed": False              # whether this ever reached the end
    }).inserted_id

    the_id_of_the_first_result_on_the_previous_page = None

    # numpages is a count, so the last page is startpagenum + numpages - 1
    for page_number in range(startpagenum, startpagenum + numpages):
        # download search results
        search_url = f"https://210.140.131.219/ajax/search/artworks/{encodeURI(tag, safe='')}?order=popular_d&mode=all&p={page_number}"
        search_data = _get_pixiv_json(search_url)

        # save raw search data
        search_collection.update_one({"_id": search_document_id}, {"$set": {"current_page": page_number}, "$push": {"search_data": {
            "page_number": page_number,
            "search_url": search_url,
            "search_data": search_data
        }}})

        if search_data['error']:
            print("error from search api:", search_data['message'])
            raise SystemExit(1)

        search_results = search_data['body']['illustManga']['data']

        # if there is no data then we are done
        if not search_results:
            print("No more search results")
            _mark_search_completed(search_document_id)
            return

        # But large tags seem to give the last page of results for any page number
        if search_results[0]['id'] == the_id_of_the_first_result_on_the_previous_page:
            print("Reached duplicate search results, looks like the end")
            _mark_search_completed(search_document_id)
            return
        the_id_of_the_first_result_on_the_previous_page = search_results[0]['id']

        # for each search result
        for illust_data_from_search in search_results:
            illust_id = illust_data_from_search['id']
            search_collection.update_one({"_id": search_document_id}, {"$set": {"current_illust": illust_id}})

            # check if this illust was already saved
            if illustration_collection.count_documents({"_id": illust_id}):
                print("already have", illust_id)
                continue

            # illust_metadata_from_search has limited information (blank descriptions)
            # download full meta data from ajax
            illust_ajax_url = f"https://210.140.131.219/ajax/illust/{illust_id}"
            illust_ajax_data = _get_pixiv_json(illust_ajax_url)
            if illust_ajax_data['error']:
                print("error from ajax api:", illust_ajax_data['message'])
                # Without metadata there is nothing reliable to inspect or
                # store; skip this artwork instead of indexing into the error
                # payload below.
                continue

            # save animated works
            ugoira_data = None
            if illust_ajax_data['body']['illustType'] == 2:
                illust_ugoira_url = illust_ajax_url + "/ugoira_meta"
                illust_ugoira_data = _get_pixiv_json(illust_ugoira_url)
                if illust_ugoira_data['error']:
                    print("error from ajax ugoira api:", illust_ugoira_data['message'])
                else:
                    original_ugoira_url = illust_ugoira_data['body']['originalSrc']
                    print("get", original_ugoira_url)
                    res = rqs.get(original_ugoira_url, headers={'referer':'https://www.pixiv.net'})
                    print("gridfs put", original_ugoira_url)
                    ugoira_data = {
                        "gridfs_id": gridfs.put(res.content, filename=original_ugoira_url.split('/').pop(), original_url=original_ugoira_url, ugoira=True),
                        "ugoira_meta": illust_ugoira_data
                    }

            # illust_ajax_data does not have "page" data (additional image urls)
            # download that
            illust_pages_url = illust_ajax_url + "/pages"
            illust_pages_data = _get_pixiv_json(illust_pages_url)
            if illust_pages_data['error']:
                print("error from ajax pages api:", illust_pages_data['message'])
                # NOTE(review): an ugoira file stored above stays in GridFS
                # without a referencing document in this case.
                continue

            # prepare database document
            document = {
                "_id": illust_id,                        # use the unique artwork id for document id so we can't have duplicates
                "illust_ajax_data": illust_ajax_data,    # save all the metadata for the artwork
                "illust_pages_data": illust_pages_data,  # save all the image urls of the data
                "downloaded_images": {},                 # map of image filenames to gridfs ids
                "ugoira_data": ugoira_data,              # animation data
                "date_saved": datetime.now()
            }

            # download originals
            # NOTE(review): the HTTP status of these responses is not checked,
            # so an error body would be stored as if it were the image.
            for illust_page_data in illust_pages_data['body']:
                original_image_url = illust_page_data['urls']['original']
                original_image_filename = original_image_url.split('/').pop()
                print("get", original_image_url)
                res = rqs.get(original_image_url, headers={'referer':'https://www.pixiv.net'})
                print("gridfs put", res.url)
                gridfs_id = gridfs.put(res.content, filename=original_image_filename, original_url=original_image_url)
                document['downloaded_images'][original_image_filename] = gridfs_id

            # add to db
            illustration_collection.insert_one(document)
            search_collection.update_one({"_id": search_document_id}, {"$push": {"results": illust_id}})

    _mark_search_completed(search_document_id)
    print("end of loop")
# Script entry point: run the downloader with the parsed command-line options.
download_popular(
    args.tag,
    startpagenum=args.startpagenum,
    numpages=args.numpages,
)
+5
View File
@@ -0,0 +1,5 @@
requests==2.26.0
requests-toolbelt==0.9.1
urllib3==1.26.6
pymongo==3.12.0
python-dotenv==0.19.0
-35
View File
@@ -1,35 +0,0 @@
# Simple downloader in Python. I couldn't get the cookie to work in order to get
# results sorted by popularity, so this is only useful for getting results
# sorted by newest/oldest and non-R18.
import argparse
import os
import requests
from urllib.parse import quote as encodeURI

# To send a session cookie, read it from a file instead, e.g.:
#cookie = open("cookie.txt", 'r').read()
cookie = None
# Only send a cookie header when a cookie is configured; the header name must be
# the literal string "cookie", not the cookie value itself.
headers = {"cookie": cookie} if cookie else {}

ap = argparse.ArgumentParser()
ap.add_argument("tags")
ap.add_argument("-p", "--pages", type=int, default=1, help="number of pages")
args = ap.parse_args()

# Make sure the output folder exists before the first image is written.
os.makedirs("download", exist_ok=True)

for i in range(1, args.pages+1):
    search_url = f"https://www.pixiv.net/ajax/search/artworks/{encodeURI(args.tags, safe='')}?order=popular_d&mode=safe&p={i}"
    print("get", search_url)
    search_data = requests.get(search_url, headers=headers).json()
    if search_data['error']:
        # a failed search aborts the run
        print(search_data['message'])
        raise SystemExit(1)
    for item in search_data['body']['illustManga']['data']:
        pages_url = f"https://www.pixiv.net/ajax/illust/{item['id']}/pages"
        print("get", pages_url)
        pages_data = requests.get(pages_url, headers=headers).json()
        if pages_data['error']:
            # a single failed artwork is reported and skipped
            print(pages_data['message'])
            continue
        for image in pages_data['body']:
            image_url = image['urls']['original']
            filename = image_url.split('/').pop()
            print("get", image_url)
            req = requests.get(image_url, headers={'referer':'https://www.pixiv.net'})
            # the with-block closes the file even if the write fails
            with open(os.path.join("download", filename), "wb") as image_file:
                image_file.write(req.content)
            print("saved", filename)