Compare commits

...

4 Commits

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| Lamp | 77ba5ba2b6 | hmm retry | 2021-09-23 20:26:06 -07:00 |
| Lamp | a1c603edcf | save ugoira | 2021-09-23 19:13:02 -07:00 |
| Lamp | 704b95823b | gridfs meta | 2021-09-23 18:03:55 -07:00 |
| Lamp | 666628dfc7 | v2 mongodb version | 2021-09-23 17:39:32 -07:00 |
9 changed files with 157 additions and 2761 deletions

3
.gitignore vendored

@@ -1,2 +1 @@
PHPSESSID.txt
download
.env
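
Judging by the updated script below, the new `.env` file (now ignored here in place of `PHPSESSID.txt`) is loaded via python-dotenv and supplies the `PHPSESSID` and `MONGODB_URI` variables the script reads from the environment. A hypothetical example with placeholder values, not part of the diff:

```
PHPSESSID=12345678_abcdefghijklmnopqrstuvwxyz
MONGODB_URI=mongodb://localhost:27017
```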

README.md

@@ -1,11 +0,0 @@
This is a Python script for downloading original pixiv images from popular search results via a premium account.
# Instructions
1. Download this repo to your computer and open a terminal in it. Run `pip install -r requirements.txt` if necessary.
2. In your browser, while logged in to Pixiv with a premium account, open the dev tools Application tab, copy the **value** of the `PHPSESSID` cookie, and paste it into a new file named `PHPSESSID.txt` in this folder.
3. Run `python pixiv-popular-downloader.py -h` for usage information. Example usage to download 10 pages of 初音ミク tag, including r18: `python pixiv-popular-downloader.py -r -p 10 "初音ミク"`
4. Check the download folder. If you're getting newest results instead of popular results, then your PHPSESSID failed to work.

3
junk/.gitignore vendored

@@ -1,3 +0,0 @@
node_modules
download
chromium_data

junk/README.md

@@ -1,12 +0,0 @@
### example usage
```sh
node pixiv-downloader.js "初音ミク" -p 3
```
A Chromium window will appear, where you'll need to sign in to your pixiv premium account if you haven't already.
### args
- `-p <num>` for number of pages (default 1)
- `-s <num>` for page number to start on (default 1)
- `-r` to include R18

2631
junk/package-lock.json generated

File diff suppressed because it is too large

junk/package.json

@@ -1 +0,0 @@
{"dependencies":{"download":"^8.0.0","minimist":"^1.2.5","puppeteer":"^10.2.0"}}

junk/pixiv-downloader.js

@@ -1,65 +0,0 @@
// node pixiv-downloader.js <tags> -p [numpages] -s [startpage] [-r]
var argv = require("minimist")(process.argv.slice(2), {
    default: {
        "p": 1,
        "s": 1
    }
});
var query = argv._.join(" ");
var puppeteer = require("puppeteer");
var download = require("download");
var fs = require("fs");
(async function(){
    var browser = await puppeteer.launch({
        headless: false,
        userDataDir: process.cwd() + "/chromium_data"
    });
    var page = await browser.newPage();
    await page.setContent(`login to <a target="_blank" href="https://accounts.pixiv.net/login">pixiv</a> if necessary, then close this page to continue.`);
    await new Promise(r => page.on("close", r));
    var page = await browser.newPage();
    for (let i = argv.s; i < argv.s + argv.p; i++) {
        let url = `https://www.pixiv.net/ajax/search/artworks/${encodeURIComponent(query)}?order=popular_d&mode=${argv.r ? "all" : "safe"}&p=${i}`;
        console.log("get page", i, url);
        await page.goto(url);
        let data = JSON.parse(await page.evaluate(() => document.querySelector("body").innerText));
        if (data.error) throw data.message;
        for (let item of data.body.illustManga.data) {
            let url = `https://www.pixiv.net/ajax/illust/${item.id}/pages`;
            console.log("get", url);
            await page.goto(url);
            let data = JSON.parse(await page.evaluate(() => document.querySelector("body").innerText));
            if (data.error) {
                console.error(data.message);
            } else {
                for (let image of data.body) {
                    let url = image.urls.original;
                    console.log("download", url);
                    if (!fs.existsSync("download")) fs.mkdirSync("download");
                    let filename = url.split('/').pop();
                    let filedir = "download/" + query + "/";
                    if (!fs.existsSync(filedir)) fs.mkdirSync(filedir);
                    let filepath = filedir + filename;
                    if (fs.existsSync(filepath)) {
                        console.log(filename, "already exists");
                        continue;
                    }
                    let write = fs.createWriteStream(filepath);
                    download(url, {
                        headers: {
                            "Referer": "https://www.pixiv.net"
                        }
                    }).pipe(write);
                    await new Promise(r => write.on("close", r));
                    console.log("saved", filename)
                }
            }
        }
    }
    console.log("complete");
    await browser.close();
})();

pixiv-popular-downloader.py

@@ -1,51 +1,169 @@
import argparse
import requests
from requests_toolbelt.adapters import host_header_ssl
from urllib3.util import Retry
from urllib.parse import quote as encodeURI
import os
from pymongo import MongoClient
from gridfs import GridFS
from datetime import datetime
from dotenv import load_dotenv
load_dotenv()
ap = argparse.ArgumentParser()
ap.add_argument("tag", help="Pixiv tag(s) to search")
ap.add_argument("-p", dest="numpages", type=int, default=1, help="number of pages to download (default 1)")
ap.add_argument("-s", dest="startpagenum", type=int, default=1, help="page number to start at")
ap.add_argument("-r", action='store_true', help="include r18 posts")
args = ap.parse_args()
PHPSESSID = None
with open("PHPSESSID.txt", 'r') as f:
    PHPSESSID = f.read()
rqs = requests.Session()
rqs.mount('https://', host_header_ssl.HostHeaderSSLAdapter())
rqs.mount('https://', host_header_ssl.HostHeaderSSLAdapter(max_retries=Retry(total=5, backoff_factor=1)))
dbclient = MongoClient(os.environ["MONGODB_URI"])
db = dbclient["mikudb"]
illustration_collection = db["illustration_collection"]
search_collection = db["search_collection"]
gridfs = GridFS(db)
def download_popular(tag, startpagenum = 1, numpages = 1):
    # record this search session in the database
    search_document_id = search_collection.insert_one({
        "date": datetime.now(), # date started
        "query": tag, # the tag being searched
        "current_page": startpagenum, # keep track of the page we're on
        "current_illust": None, # keep track of which item is being downloaded
        "search_data": [], # save each payload
        "results": [], # collect ids of all results that were saved
        "completed": False # whether this ever reached the end
    }).inserted_id
    the_id_of_the_first_result_on_the_previous_page = None
    for page_number in range(startpagenum, numpages+1):
        # download search results
        search_url = f"https://210.140.131.219/ajax/search/artworks/{encodeURI(tag, safe='')}?order=popular_d&mode=all&p={page_number}"
        print("get", search_url)
        search_data = rqs.get(search_url, cookies={"PHPSESSID": os.environ["PHPSESSID"]}, headers={"host":"www.pixiv.net"}).json()
        # save raw search data
        search_collection.update_one({"_id": search_document_id}, {"$set": {"current_page": page_number}, "$push": {"search_data": {
            "page_number": page_number,
            "search_url": search_url,
            "search_data": search_data
        }}})
        if (search_data['error']):
            print("error from search api:", search_data['message'])
            exit(1)
        search_results = search_data['body']['illustManga']['data']
        # if there is no data then we are done
        if not search_results:
            print("No more search results")
            search_collection.update_one({"_id": search_document_id}, {"$set":{"completed": True}})
            exit()
        # But large tags seem to give the last page of results for any page number
        if search_results[0]['id'] == the_id_of_the_first_result_on_the_previous_page:
            print("Reached duplicate search results, looks like the end")
            search_collection.update_one({"_id": search_document_id}, {"$set":{"completed": True}})
            exit()
        else: the_id_of_the_first_result_on_the_previous_page = search_results[0]['id']
        # for each search result
        for illust_data_from_search in search_results:
            illust_id = illust_data_from_search['id']
            search_collection.update_one({"_id": search_document_id}, {"$set":{"current_illust": illust_id}})
            # check if this illust was already saved
            if illustration_collection.count_documents({"_id": illust_id}):
                print("already have", illust_id)
                continue
            # illust_data_from_search has limited information (blank descriptions)
            # download full metadata from ajax
            illust_ajax_url = f"https://210.140.131.219/ajax/illust/{illust_id}"
            print("get", illust_ajax_url)
            illust_ajax_data = rqs.get(illust_ajax_url, cookies={"PHPSESSID": os.environ["PHPSESSID"]}, headers={"host":"www.pixiv.net"}).json()
            if (illust_ajax_data['error']):
                print("error from ajax api:", illust_ajax_data['message'])
            # save animated works
            ugoira_data = None
            if (illust_ajax_data['body']['illustType'] == 2):
                illust_ugoira_url = illust_ajax_url + "/ugoira_meta"
                print("get", illust_ugoira_url)
                illust_ugoira_data = rqs.get(illust_ugoira_url, cookies={"PHPSESSID": os.environ["PHPSESSID"]}, headers={"host":"www.pixiv.net"}).json()
                if (illust_ugoira_data['error']):
                    print("error from ajax ugoira api:", illust_ugoira_data['message'])
                else:
                    original_ugoira_url = illust_ugoira_data['body']['originalSrc']
                    print("get", original_ugoira_url)
                    res = rqs.get(original_ugoira_url, headers={'referer':'https://www.pixiv.net'})
                    print("gridfs put", original_ugoira_url)
                    ugoira_data = {
                        "gridfs_id": gridfs.put(res.content, filename=original_ugoira_url.split('/').pop(), original_url=original_ugoira_url, ugoira=True),
                        "ugoira_meta": illust_ugoira_data
                    }
            # illust_ajax_data does not have "page" data (additional image urls)
            # download that
            illust_pages_url = illust_ajax_url + "/pages"
            print("get", illust_pages_url)
            illust_pages_data = rqs.get(illust_pages_url, cookies={"PHPSESSID": os.environ["PHPSESSID"]}, headers={"host":"www.pixiv.net"}).json()
            if (illust_pages_data['error']):
                print("error from ajax pages api:", illust_pages_data['message'])
            else:
                # prepare database document
                document = {
                    "_id": illust_id, # use the unique artwork id for document id so we can't have duplicates
                    "illust_ajax_data": illust_ajax_data, # save all the metadata for the artwork
                    "illust_pages_data": illust_pages_data, # save all the image urls of the data
                    "downloaded_images": {}, # map of image filenames to gridfs ids
                    "ugoira_data": ugoira_data, # animation data
                    "date_saved": datetime.now()
                }
                # download originals
                for illust_page_data in illust_pages_data['body']:
                    original_image_url = illust_page_data['urls']['original']
                    original_image_filename = original_image_url.split('/').pop()
                    print("get", original_image_url)
                    res = rqs.get(original_image_url, headers={'referer':'https://www.pixiv.net'})
                    print("gridfs put", res.url)
                    gridfs_id = gridfs.put(res.content, filename=original_image_filename, original_url=original_image_url)
                    document['downloaded_images'][original_image_filename] = gridfs_id
                # add to db
                illustration_collection.insert_one(document)
                search_collection.update_one({"_id": search_document_id}, {"$push": {"results": illust_id}})
    search_collection.update_one({"_id": search_document_id}, {"$set":{"completed": True}})
    print("end of loop")
download_popular(args.tag, args.startpagenum, args.numpages)
download_count = 1
for i in range(args.startpagenum, args.numpages+1):
    page_url = f"https://210.140.131.219/ajax/search/artworks/{encodeURI(args.tag, safe='')}?order=popular_d&mode={'all' if args.r else 'safe'}&p={i}"
    print("get", page_url)
    page_data = rqs.get(page_url, cookies={"PHPSESSID": PHPSESSID}, headers={"host":"www.pixiv.net"}).json()
    if (page_data['error']):
        print(page_data['message'])
        exit(1)
    for illust in page_data['body']['illustManga']['data']:
        illust_r18 = bool(illust['xRestrict'])
        illust_url = f"https://210.140.131.219/ajax/illust/{illust['id']}/pages"
        print("get", illust_url)
        illust_data = rqs.get(illust_url, headers={"host":"www.pixiv.net"}).json()
        if (illust_data['error']):
            print(illust_data['message'])
        else:
            for image in illust_data['body']:
                image_url = image['urls']['original']
                download_dir = f"download/{args.tag}/"
                os.makedirs(download_dir, exist_ok=True)
                download_filename = str(download_count) + '_' + ('x_' if illust_r18 else '') + image_url.split('/').pop()
                download_path = download_dir + download_filename
                if os.path.exists(download_path):
                    print(download_path, "already exists")
                    continue
                print("get", image_url)
                res = rqs.get(image_url, headers={'referer':'https://www.pixiv.net'})
                with open(download_path, "wb") as f:
                    f.write(res.content)
                print("saved", download_filename)
                download_count = download_count + 1
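
Since the rewritten script above stores originals in MongoDB GridFS rather than in a local `download` folder, here is a minimal sketch (not part of the diff) of reading a saved image back out, assuming the same `MONGODB_URI`, `mikudb` database, and collection layout used above:

```python
# Minimal sketch (assumption: same database layout as the script above).
import os
from pymongo import MongoClient
from gridfs import GridFS
from dotenv import load_dotenv

load_dotenv()
db = MongoClient(os.environ["MONGODB_URI"])["mikudb"]
gridfs = GridFS(db)

doc = db["illustration_collection"].find_one()  # any saved illustration document
for filename, gridfs_id in doc["downloaded_images"].items():
    with open(filename, "wb") as f:
        f.write(gridfs.get(gridfs_id).read())  # GridFS.get returns a file-like GridOut
    print("restored", filename)
```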

requirements.txt

@@ -1,3 +1,5 @@
requests==2.26.0
requests-toolbelt==0.9.1
urllib3==1.26.6
pymongo==3.12.0
python-dotenv==0.19.0
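
With the new dependencies installed (`pip install -r requirements.txt`) and a `.env` file in place of the old `PHPSESSID.txt`, the invocation from the old README should still apply, e.g. `python pixiv-popular-downloader.py -p 10 "初音ミク"`; note that the `-r` flag is gone in this version, since the rewritten script always searches with `mode=all`.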