delete junk

i branched
2021-09-24 10:53:14 -07:00
4 changed files with 48 additions and 155 deletions
@@ -1 +1,2 @@
-.env
+PHPSESSID.txt
 download
@@ -0,0 +1,11 @@
 This is a Python script for downloading original Pixiv images from popular search results via a premium account.
 # Instructions
 1. Download this repo to your computer and open a terminal in its folder. Run `pip install -r requirements.txt` if necessary.
 2. In your browser, log in to Pixiv with a premium account, open the Application tab of the dev tools, copy the **value** of the `PHPSESSID` cookie, and paste it into a new file named `PHPSESSID.txt` in this folder.
 3. Run `python pixiv-popular-downloader.py -h` for usage information. Example usage to download 10 pages of the 初音ミク tag, including R18 posts: `python pixiv-popular-downloader.py -r -p 10 "初音ミク"`
 4. Check the `download` folder. If you get the newest results instead of the popular results, your PHPSESSID did not work.
@@ -3,166 +3,49 @@ import requests
 from requests_toolbelt.adapters import host_header_ssl
 from urllib.parse import quote as encodeURI
 import os
 from pymongo import MongoClient
 from gridfs import GridFS
 from datetime import datetime
 from dotenv import load_dotenv
 load_dotenv()
 ap = argparse.ArgumentParser()
 ap.add_argument("tag", help="Pixiv tag(s) to search")
 ap.add_argument("-p", dest="numpages", type=int, default=1, help="number of pages to download (default 1)")
 ap.add_argument("-s", dest="startpagenum", type=int, default=1, help="page number to start at")
 ap.add_argument("-r", action='store_true', help="include r18 posts")
 args = ap.parse_args()
 PHPSESSID = None
 with open("PHPSESSID.txt", 'r') as f:
 	PHPSESSID = f.read()
 rqs = requests.Session()
 rqs.mount('https://', host_header_ssl.HostHeaderSSLAdapter())
-
+download_count = 1
-
+for i in range(args.startpagenum, args.numpages+1):
-dbclient = MongoClient(os.environ["MONGODB_URI"])
+	page_url = f"https://210.140.131.219/ajax/search/artworks/{encodeURI(args.tag, safe='')}?order=popular_d&mode={'all' if args.r else 'safe'}&p={i}"
-
+	print("get", page_url)
-db = dbclient["mikudb"]
+	page_data = rqs.get(page_url, cookies={"PHPSESSID": PHPSESSID}, headers={"host":"www.pixiv.net"}).json()
-illustration_collection = db["illustration_collection"]
+	if (page_data['error']):
-search_collection = db["search_collection"]
+		print(page_data['message'])
-
+		exit(1)
-gridfs = GridFS(db)
+	for illust in page_data['body']['illustManga']['data']:
-
+		illust_r18 = bool(illust['xRestrict'])
-
+		illust_url = f"https://210.140.131.219/ajax/illust/{illust['id']}/pages"
-
+		print("get", illust_url)
-
+		illust_data = rqs.get(illust_url, headers={"host":"www.pixiv.net"}).json()
-def download_popular(tag, startpagenum = 1, numpages = 1):
+		if (illust_data['error']):
-	
+			print(illust_data['message'])
-
+		else:
-	# record this search session in the database
+			for image in illust_data['body']:
-	search_document_id = search_collection.insert_one({
+				image_url = image['urls']['original']
-		"date": datetime.now(), # date started
+				download_dir = f"download/{args.tag}/"
-		"query": tag, # the tag being searched
+				os.makedirs(download_dir, exist_ok=True)
-		"current_page": startpagenum, # keep track of the page we're on
+				download_filename = str(download_count) + '_' + ('x_' if illust_r18 else '') + image_url.split('/').pop()
-		"current_illust": None, # keep track of which item is being downloaded
+				download_path = download_dir + download_filename
-		"search_data": [], # save each payload
+				if os.path.exists(download_path):
-		"results": [], # collect ids of all results that were saved
+					print(download_path, "already exists")
-		"completed": False # whether this ever reached the end
+					continue
-	}).inserted_id
+				print("get", image_url)
-
+				res = rqs.get(image_url, headers={'referer':'https://www.pixiv.net'})
-
+				with open(download_path, "wb") as f:
-	the_id_of_the_first_result_on_the_previous_page = None
+					f.write(res.content)
-	for page_number in range(startpagenum, numpages+1):
+				print("saved", download_filename)
-		# download search results
+				download_count = download_count + 1
 		search_url = f"https://210.140.131.219/ajax/search/artworks/{encodeURI(tag, safe='')}?order=popular_d&mode=all&p={page_number}"
 		print("get", search_url)
 		search_data = rqs.get(search_url, cookies={"PHPSESSID": os.environ["PHPSESSID"]}, headers={"host":"www.pixiv.net"}).json()
 		# save raw search data
 		search_collection.update_one({"_id": search_document_id}, {"$set": {"current_page": page_number}, "$push": {"search_data": {
 			"page_number": page_number,
 			"search_url": search_url,
 			"search_data": search_data
 		}}})
 		if (search_data['error']):
 			print("error from search api:", search_data['message'])
 			exit(1)
 		search_results = search_data['body']['illustManga']['data']
 		# if there is no data then we are done
 		if not search_results:
 			print("No more search results")
 			search_collection.update_one({"_id": search_document_id}, {"$set":{"completed": True}})
 			exit()
 		# But large tags seem to give the last page of results for any page number
 		if search_results[0]['id'] == the_id_of_the_first_result_on_the_previous_page:
 			print("Reached duplicate search results, looks like the end")
 			search_collection.update_one({"_id": search_document_id}, {"$set":{"completed": True}})
 			exit()
 		else: the_id_of_the_first_result_on_the_previous_page = search_results[0]['id']
 		# for each search result
 		for illust_data_from_search in search_results:
 			illust_id = illust_data_from_search['id']
 			search_collection.update_one({"_id": search_document_id}, {"$set":{"current_illust": illust_id}})
 			# check if this illust was already saved
 			if illustration_collection.count_documents({"_id": illust_id}):
 				print("already have", illust_id)
 				continue
 			# illust_metadata_from_search has limited information (blank descriptions)
 			# download full meta data from ajax
 			illust_ajax_url = f"https://210.140.131.219/ajax/illust/{illust_id}"
 			print("get", illust_ajax_url)
 			illust_ajax_data = rqs.get(illust_ajax_url, cookies={"PHPSESSID": os.environ["PHPSESSID"]}, headers={"host":"www.pixiv.net"}).json()
 			if (illust_ajax_data['error']):
 				print("error from ajax api:", illust_ajax_data['message'])
 			# save animated works
 			ugoira_data = None
 			if (illust_ajax_data['body']['illustType'] == 2):
 				illust_ugoira_url = illust_ajax_url + "/ugoira_meta"
 				print("get", illust_ugoira_url)
 				illust_ugoira_data = rqs.get(illust_ugoira_url, cookies={"PHPSESSID": os.environ["PHPSESSID"]}, headers={"host":"www.pixiv.net"}).json()
 				if (illust_ugoira_data['error']):
 					print("error from ajax ugoira api:", illust_ugoira_data['message'])
 				else:
 					original_ugoira_url = illust_ugoira_data['body']['originalSrc']
 					print("get", original_ugoira_url)
 					res = rqs.get(original_ugoira_url, headers={'referer':'https://www.pixiv.net'})
 					print("gridfs put", original_ugoira_url)
 					ugoira_data = {
 						"gridfs_id": gridfs.put(res.content, filename=original_ugoira_url.split('/').pop(), original_url=original_ugoira_url, ugoira=True),
 						"ugoira_meta": illust_ugoira_data
 					}
 			# illust_ajax_data does not have "page" data (additional image urls)
 			# download that
 			illust_pages_url = illust_ajax_url + "/pages"
 			print("get", illust_pages_url)
 			illust_pages_data = rqs.get(illust_pages_url, cookies={"PHPSESSID": os.environ["PHPSESSID"]}, headers={"host":"www.pixiv.net"}).json()
 			if (illust_pages_data['error']):
 				print("error from ajax pages api:", illust_pages_data['message'])
 			else:
 				# prepare database document
 				document = {
 					"_id": illust_id, # use the unique artwork id for document id so we can't have duplicates
 					"illust_ajax_data": illust_ajax_data, # save all the metadata for the artwork
 					"illust_pages_data": illust_pages_data, # save all the image urls of the data
 					"downloaded_images": {}, # map of image filenames to gridfs ids
 					"ugoira_data": ugoira_data, # animation data
 					"date_saved": datetime.now()
 				}
 				# download originals
 				for illust_page_data in illust_pages_data['body']:
 					original_image_url = illust_page_data['urls']['original']
 					original_image_filename = original_image_url.split('/').pop()
 					print("get", original_image_url)
 					res = rqs.get(original_image_url, headers={'referer':'https://www.pixiv.net'})
 					print("gridfs put", res.url)
 					gridfs_id = gridfs.put(res.content, filename=original_image_filename, original_url=original_image_url)
 					document['downloaded_images'][original_image_filename] = gridfs_id
 				# add to db
 				illustration_collection.insert_one(document)
 				search_collection.update_one({"_id": search_document_id}, {"$push": {"results": illust_id}})
 	search_collection.update_one({"_id": search_document_id}, {"$set":{"completed": True}})
 	print("end of loop")
 download_popular(args.tag, args.startpagenum, args.numpages)
@@ -1,5 +1,3 @@
 requests==2.26.0
 requests-toolbelt==0.9.1
 urllib3==1.26.6
 pymongo==3.12.0
 python-dotenv==0.19.0