import argparse
import os
import sys
from datetime import datetime
from urllib.parse import quote as encodeURI

import requests
from requests_toolbelt.adapters import host_header_ssl
from pymongo import MongoClient
from gridfs import GridFS
from dotenv import load_dotenv

load_dotenv()
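
# The .env file loaded above is expected to define the two secrets read from
# os.environ below (these values are placeholders):
#
#   MONGODB_URI=mongodb://localhost:27017
#   PHPSESSID=<your pixiv session cookie>
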
ap = argparse.ArgumentParser()
ap.add_argument("tag", help="Pixiv tag(s) to search")
ap.add_argument("-p", dest="numpages", type=int, default=1, help="number of pages to download (default 1)")
ap.add_argument("-s", dest="startpagenum", type=int, default=1, help="page number to start at (default 1)")
args = ap.parse_args()

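# requests normally verifies the TLS certificate against the hostname in the
# URL. HostHeaderSSLAdapter verifies it against the Host header instead, which
# is what lets the hardcoded-IP URLs below (210.140.131.219 with a
# "host: www.pixiv.net" header) pass certificate verification.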
rqs = requests.Session()
rqs.mount('https://', host_header_ssl.HostHeaderSSLAdapter())

dbclient = MongoClient(os.environ["MONGODB_URI"])
db = dbclient["mikudb"]
illustration_collection = db["illustration_collection"]  # one document per saved artwork
search_collection = db["search_collection"]              # one document per search session

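# GridFS stores each file across the fs.files / fs.chunks collections, so the
# downloaded images can exceed MongoDB's 16 MB document limit while still
# living in the same database.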
gridfs = GridFS(db)
def download_popular(tag, startpagenum=1, numpages=1):
    # record this search session in the database
    search_document_id = search_collection.insert_one({
        "date": datetime.now(),        # date started
        "query": tag,                  # the tag being searched
        "current_page": startpagenum,  # keep track of the page we're on
        "current_illust": None,        # keep track of which item is being downloaded
        "search_data": [],             # save each payload
        "results": [],                 # collect ids of all results that were saved
        "completed": False             # whether this ever reached the end
    }).inserted_id

    the_id_of_the_first_result_on_the_previous_page = None
    # -p is a page count, so the last page is startpagenum + numpages - 1
    for page_number in range(startpagenum, startpagenum + numpages):
        # download search results
        search_url = f"https://210.140.131.219/ajax/search/artworks/{encodeURI(tag, safe='')}?order=popular_d&mode=all&p={page_number}"
        print("get", search_url)
        search_data = rqs.get(search_url, cookies={"PHPSESSID": os.environ["PHPSESSID"]}, headers={"host": "www.pixiv.net"}).json()

        # save raw search data
        search_collection.update_one({"_id": search_document_id}, {"$set": {"current_page": page_number}, "$push": {"search_data": {
            "page_number": page_number,
            "search_url": search_url,
            "search_data": search_data
        }}})

        if search_data['error']:
            print("error from search api:", search_data['message'])
            sys.exit(1)

        search_results = search_data['body']['illustManga']['data']

        # if there is no data then we are done
        if not search_results:
            print("No more search results")
            search_collection.update_one({"_id": search_document_id}, {"$set": {"completed": True}})
            sys.exit()

        # large tags seem to return the last page of results for any page
        # number, so stop when a page starts with the same result as the
        # previous one
        if search_results[0]['id'] == the_id_of_the_first_result_on_the_previous_page:
            print("Reached duplicate search results, looks like the end")
            search_collection.update_one({"_id": search_document_id}, {"$set": {"completed": True}})
            sys.exit()
        else:
            the_id_of_the_first_result_on_the_previous_page = search_results[0]['id']

        # for each search result
        for illust_data_from_search in search_results:
            illust_id = illust_data_from_search['id']

            search_collection.update_one({"_id": search_document_id}, {"$set": {"current_illust": illust_id}})

            # check if this illust was already saved
            if illustration_collection.count_documents({"_id": illust_id}):
                print("already have", illust_id)
                continue

            # illust_data_from_search has limited information (blank
            # descriptions), so download the full metadata from ajax
            illust_ajax_url = f"https://210.140.131.219/ajax/illust/{illust_id}"
            print("get", illust_ajax_url)
            illust_ajax_data = rqs.get(illust_ajax_url, cookies={"PHPSESSID": os.environ["PHPSESSID"]}, headers={"host": "www.pixiv.net"}).json()
            if illust_ajax_data['error']:
                print("error from ajax api:", illust_ajax_data['message'])
                continue  # don't save an artwork with broken metadata

            # illust_ajax_data does not have "page" data (additional image
            # urls), so download that as well
            illust_pages_url = illust_ajax_url + "/pages"
            print("get", illust_pages_url)
            illust_pages_data = rqs.get(illust_pages_url, cookies={"PHPSESSID": os.environ["PHPSESSID"]}, headers={"host": "www.pixiv.net"}).json()
            if illust_pages_data['error']:
                print("error from ajax pages api:", illust_pages_data['message'])
            else:
                # prepare database document
                document = {
                    "_id": illust_id,  # use the unique artwork id for the document id so we can't have duplicates
                    "illust_ajax_data": illust_ajax_data,  # save all the metadata for the artwork
                    "illust_pages_data": illust_pages_data,  # save all the image urls of the artwork
                    "downloaded_images": {},  # map of image filenames to gridfs ids
                    "date_saved": datetime.now()
                }

                # download originals
                for illust_page_data in illust_pages_data['body']:
                    original_image_url = illust_page_data['urls']['original']
                    original_image_filename = original_image_url.split('/').pop()

                    # pixiv's image host rejects requests without a pixiv referer
                    print("get", original_image_url)
                    res = rqs.get(original_image_url, headers={'referer': 'https://www.pixiv.net'})

                    print("gridfs put", res.url)
                    gridfs_id = gridfs.put(res.content)
                    document['downloaded_images'][original_image_filename] = gridfs_id

                # add to db
                illustration_collection.insert_one(document)
                search_collection.update_one({"_id": search_document_id}, {"$push": {"results": {
                    "id": illust_id
                }}})

    search_collection.update_one({"_id": search_document_id}, {"$set": {"completed": True}})
    print("end of loop")

download_popular(args.tag, args.startpagenum, args.numpages)
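
# Example invocation (the script name is hypothetical):
#
#   python download_popular.py "初音ミク" -p 3
#
# Saved images can later be read back out of GridFS, e.g. (some_illust_id
# stands in for a real artwork id):
#
#   doc = illustration_collection.find_one({"_id": some_illust_id})
#   for filename, fid in doc["downloaded_images"].items():
#       with open(filename, "wb") as f:
#           f.write(gridfs.get(fid).read())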