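"""Scrape pixiv search results for a tag, ordered by popularity, into MongoDB.

For each artwork this saves the full ajax metadata, every original image
(in GridFS), and ugoira animation data when present. Each search session is
also recorded. Expects MONGODB_URI and PHPSESSID in the environment or a
.env file.
"""
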
import argparse
import os
import sys
from datetime import datetime
from urllib.parse import quote as encodeURI

import requests
from requests_toolbelt.adapters import host_header_ssl
from urllib3.util import Retry
from pymongo import MongoClient
from gridfs import GridFS
from dotenv import load_dotenv

# load MONGODB_URI and PHPSESSID from a .env file if one is present
load_dotenv()

ap = argparse.ArgumentParser()
ap.add_argument("tag", help="Pixiv tag(s) to search")
ap.add_argument("-p", dest="numpages", type=int, default=1, help="number of pages to download (default 1)")
ap.add_argument("-s", dest="startpagenum", type=int, default=1, help="page number to start at (default 1)")
args = ap.parse_args()

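# requests are sent to a hard-coded pixiv IP with an explicit "www.pixiv.net"
# Host header (presumably to sidestep DNS interference), so HostHeaderSSLAdapter
# is mounted to validate the TLS certificate against the Host header rather
# than the IP; retries with backoff handle transient failures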
rqs = requests.Session()
rqs.mount('https://', host_header_ssl.HostHeaderSSLAdapter(max_retries=Retry(total=5, backoff_factor=1)))

dbclient = MongoClient(os.environ["MONGODB_URI"])

db = dbclient["mikudb"]
illustration_collection = db["illustration_collection"]
search_collection = db["search_collection"]

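# image files go into GridFS, since they can exceed MongoDB's 16 MB document size limit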
gridfs = GridFS(db)


def download_popular(tag, startpagenum=1, numpages=1):
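	"""Page through popularity-ordered search results for `tag` and save every new artwork."""
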
	# record this search session in the database
	search_document_id = search_collection.insert_one({
		"date": datetime.now(), # date started
		"query": tag, # the tag being searched
		"current_page": startpagenum, # keep track of the page we're on
		"current_illust": None, # keep track of which item is being downloaded
		"search_data": [], # save each payload
		"results": [], # collect ids of all results that were saved
		"completed": False # whether this ever reached the end
	}).inserted_id

	the_id_of_the_first_result_on_the_previous_page = None
	# -s is the first page and -p is how many pages to fetch, so the range must
	# end at startpagenum + numpages, not numpages + 1
	for page_number in range(startpagenum, startpagenum + numpages):
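		# the PHPSESSID cookie authenticates these requests; order=popular_d
		# (popularity, descending) appears to be honored only for Premium accounts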
		# download search results
		search_url = f"https://210.140.131.219/ajax/search/artworks/{encodeURI(tag, safe='')}?order=popular_d&mode=all&p={page_number}"
		print("get", search_url)
		search_data = rqs.get(search_url, cookies={"PHPSESSID": os.environ["PHPSESSID"]}, headers={"host":"www.pixiv.net"}).json()

		# save raw search data
		search_collection.update_one({"_id": search_document_id}, {"$set": {"current_page": page_number}, "$push": {"search_data": {
			"page_number": page_number,
			"search_url": search_url,
			"search_data": search_data
		}}})

		if search_data['error']:
			print("error from search api:", search_data['message'])
			sys.exit(1)

		search_results = search_data['body']['illustManga']['data']

		# if there is no data then we are done
		if not search_results:
			print("No more search results")
			search_collection.update_one({"_id": search_document_id}, {"$set":{"completed": True}})
			sys.exit()

		# but large tags seem to return the last page of results for any later
		# page number, so a repeated first result also means we reached the end
		if search_results[0]['id'] == the_id_of_the_first_result_on_the_previous_page:
			print("Reached duplicate search results, looks like the end")
			search_collection.update_one({"_id": search_document_id}, {"$set":{"completed": True}})
			sys.exit()
		else:
			the_id_of_the_first_result_on_the_previous_page = search_results[0]['id']

		# for each search result
		for illust_data_from_search in search_results:
			illust_id = illust_data_from_search['id']

			search_collection.update_one({"_id": search_document_id}, {"$set":{"current_illust": illust_id}})

			# check if this illust was already saved
			if illustration_collection.count_documents({"_id": illust_id}):
				print("already have", illust_id)
				continue

			# illust_data_from_search has limited information (blank descriptions),
			# so download the full metadata from the ajax api
			illust_ajax_url = f"https://210.140.131.219/ajax/illust/{illust_id}"
			print("get", illust_ajax_url)
			illust_ajax_data = rqs.get(illust_ajax_url, cookies={"PHPSESSID": os.environ["PHPSESSID"]}, headers={"host":"www.pixiv.net"}).json()
			if illust_ajax_data['error']:
				print("error from ajax api:", illust_ajax_data['message'])
				continue # without this metadata, 'body' below would be missing

			# save animated works (illustType 2 is an ugoira: an animation served as a zip of frames)
			ugoira_data = None
			if illust_ajax_data['body']['illustType'] == 2:
				illust_ugoira_url = illust_ajax_url + "/ugoira_meta"
				print("get", illust_ugoira_url)
				illust_ugoira_data = rqs.get(illust_ugoira_url, cookies={"PHPSESSID": os.environ["PHPSESSID"]}, headers={"host":"www.pixiv.net"}).json()
				if illust_ugoira_data['error']:
					print("error from ajax ugoira api:", illust_ugoira_data['message'])
				else:
					original_ugoira_url = illust_ugoira_data['body']['originalSrc']
					print("get", original_ugoira_url)
					# the image host rejects requests without a pixiv referer
					res = rqs.get(original_ugoira_url, headers={'referer':'https://www.pixiv.net'})
					print("gridfs put", original_ugoira_url)
					ugoira_data = {
						"gridfs_id": gridfs.put(res.content, filename=original_ugoira_url.split('/').pop(), original_url=original_ugoira_url, ugoira=True),
						"ugoira_meta": illust_ugoira_data
					}

			# illust_ajax_data does not have "page" data (the urls of each image
			# in the artwork), so download that too
			illust_pages_url = illust_ajax_url + "/pages"
			print("get", illust_pages_url)
			illust_pages_data = rqs.get(illust_pages_url, cookies={"PHPSESSID": os.environ["PHPSESSID"]}, headers={"host":"www.pixiv.net"}).json()
			if illust_pages_data['error']:
				print("error from ajax pages api:", illust_pages_data['message'])
			else:
				# prepare database document
				document = {
					"_id": illust_id, # use the unique artwork id as the document id so we can't have duplicates
					"illust_ajax_data": illust_ajax_data, # all the metadata for the artwork
					"illust_pages_data": illust_pages_data, # all the image urls for the artwork
					"downloaded_images": {}, # map of image filenames to gridfs ids
					"ugoira_data": ugoira_data, # animation data, if any
					"date_saved": datetime.now()
				}

				# download originals
				for illust_page_data in illust_pages_data['body']:

					original_image_url = illust_page_data['urls']['original']
					original_image_filename = original_image_url.split('/').pop()

					print("get", original_image_url)
					res = rqs.get(original_image_url, headers={'referer':'https://www.pixiv.net'})

					print("gridfs put", res.url)
					gridfs_id = gridfs.put(res.content, filename=original_image_filename, original_url=original_image_url)
					document['downloaded_images'][original_image_filename] = gridfs_id

				# add to db
				illustration_collection.insert_one(document)
				search_collection.update_one({"_id": search_document_id}, {"$push": {"results": illust_id}})

	search_collection.update_one({"_id": search_document_id}, {"$set":{"completed": True}})
	print("end of loop")


download_popular(args.tag, args.startpagenum, args.numpages)
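# Example invocation (the script name is whatever this file is saved as):
#   python download_popular.py "初音ミク" -p 3 -s 2
# downloads pages 2, 3, and 4 of the popularity-ordered results for the tag.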