Learning Python Web Scraping

A round-up of some small crawlers I have written in the past.

12306 Ticket Query

# 12306Check.py
import requests
import pandas as pd
import json

# For repeated queries, just wrap the request below in a loop
with open('stations.json', mode='r', encoding='utf-8') as f:
    city_json = json.loads(f.read())

# Station names must be entered in Chinese, matching the keys in stations.json
from_station = input('Departure station: ')
to_station = input('Arrival station: ')
train_date = input('Departure date (format: 2022-04-03): ')
# from_station = '武汉'
# to_station = '长沙'
# train_date = '2022-04-06'
# print(city_json[from_station])
# print(city_json[to_station])
# print(train_date)
url = "https://kyfw.12306.cn/otn/leftTicket/query"
data = {
    'leftTicketDTO.train_date': train_date,
    'leftTicketDTO.from_station': city_json[from_station],
    'leftTicketDTO.to_station': city_json[to_station],
    'purpose_codes': 'ADULT'
}
headers = {
    # Copy your own Cookie from the browser's request headers
    'Cookie': '_uab_collina=164896135403377792450208; JSESSIONID=843F9AA2FC700ED2B8E8655CBA8E4AFD; BIGipServerotn=468713994.50210.0000; BIGipServerpool_passport=182714890.50215.0000; highContrastMode=defaltMode; guidesStatus=off; cursorStatus=off; RAIL_EXPIRATION=1649305414970; RAIL_DEVICEID=i7hoNlwMechBKbH0Ghsk6cswUh1nhyGKgfThBDYYiAGy_zBKBevr3wq6zqlS1GV-JvK0B-qVEYwihXSg-WVTUlOcG7NgUJc2INRx-UN6MMxdgg6sumeKjo2NXUwAOlNL7uSajwkmH4Aleee-puGjfAVFL2aVyYMY; route=9036359bb8a8a461c164a04f8f50b252; _jc_save_toStation=%u957F%u6C99%2CCSQ; _jc_save_fromDate=2022-04-03; _jc_save_toDate=2022-04-03; _jc_save_wfdc_flag=dc; _jc_save_fromStation=%u6B66%u6C49%2CWHN',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36'
}

response = requests.get(url=url, params=data, headers=headers)
# response is the <Response [200]> object; response.text holds the body text

# If the text comes back garbled, fix the encoding:
# response.encoding = 'utf-8'                     # force utf-8, or
response.encoding = response.apparent_encoding    # let requests guess

result = response.json()['data']['result']
lis = []
for index in result:
    # Each record is one '|'-separated string; 有/无 mean available/sold out
    index_list = index.replace('有', 'Yes').replace('无', 'No').split('|')
    TrainNum = index_list[3]
    StartTime = index_list[8]
    EndTime = index_list[9]
    UseTime = index_list[10]
    if 'G' in TrainNum:  # high-speed trains report different seat classes
        BusinessSeats = index_list[32]  # business class
        FirstSeats = index_list[31]     # first class
        SecondSeats = index_list[30]    # second class
        di = {
            'Num': TrainNum,
            'Start': StartTime,
            'End': EndTime,
            'Use': UseTime,
            'Top': BusinessSeats,
            'First': FirstSeats,
            'Second': SecondSeats,
            'Soft': '',
            'HBed': '',
            'HSeats': '',
            'NSeats': ''
        }
    else:
        SoftBeds = index_list[23]   # soft sleeper
        HardBeds = index_list[28]   # hard sleeper
        HardSeats = index_list[29]  # hard seat
        NoSeats = index_list[26]    # standing ticket
        di = {
            'Num': TrainNum,
            'Start': StartTime,
            'End': EndTime,
            'Use': UseTime,
            'Top': '',
            'First': '',
            'Second': '',
            'Soft': SoftBeds,
            'HBed': HardBeds,
            'HSeats': HardSeats,
            'NSeats': NoSeats
        }
    # print(di)
    lis.append(di)

pd.set_option('display.max_rows', None)  # show every row instead of truncating
content = pd.DataFrame(lis)
print(content)
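
The comment at the top says repeated querying is just a matter of adding a loop. A minimal polling sketch along those lines (the target train number and the 60-second interval are made-up values; the field indices reuse the ones above):

import time

TARGET_TRAIN = 'G404'  # hypothetical train number to watch

while True:
    response = requests.get(url=url, params=data, headers=headers)
    found = False
    for record in response.json()['data']['result']:
        fields = record.split('|')
        # fields[3] is the train number, fields[30] the second-class count
        if fields[3] == TARGET_TRAIN and fields[30] not in ('', '无'):
            print('Second-class tickets available on', TARGET_TRAIN)
            found = True
            break
    if found:
        break
    time.sleep(60)  # be polite: wait a minute between queries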

Generating the stations.json file:

# get_station_code.py
import json
import requests
from requests.exceptions import RequestException

def getResponse(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response
        return None
    except RequestException:
        return None


if __name__ == "__main__":
    url = "https://kyfw.12306.cn/otn/resources/js/framework/station_name.js"
    data = getResponse(url)
    if data is not None:
        dict_data = {}
        # Records look like @bjb|北京北|VAP|...: the Chinese name is field 1,
        # the telecode field 2
        for chars in data.text.split('@')[1:]:
            station = chars.split('|')
            dict_data[station[1]] = station[2]
        with open("stations.json", 'w', encoding='utf-8') as fp:
            json.dump(dict_data, fp, ensure_ascii=False)
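
The resulting stations.json maps Chinese station names to their telecodes, which is exactly the lookup 12306Check.py does with city_json[from_station]. A quick sanity check:

import json

with open('stations.json', encoding='utf-8') as fp:
    stations = json.load(fp)

# Keys are Chinese station names, values are telecodes,
# e.g. '武汉' -> 'WHN' (the same code that appears in the cookie above).
print(stations['武汉'])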

NetEase Cloud Music Hot Song Scraper

# 163MusicGet.py
import requests
import re
import os

def change_name(name):
    # Replace characters that are illegal (or just awkward) in filenames
    new_name = re.sub(r'[\/\\\:\*\?\"\<\>\|\✩\˚\*\•̩̩͙ʚ]', '_', name)
    return new_name

filename = "music\\"
if not os.path.exists(filename):
    os.mkdir(filename)

url = "https://music.163.com/discover/toplist?id=19723756"  # switch this id for a different chart
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36'
}

response = requests.get(url=url, headers=headers)
html_data = re.findall(r'<li><a href="/song\?id=(\d+)">(.*?)</a>', response.text)  # remember to escape the ? in the pattern
for music_id, music_name in html_data:
    # Download URL pattern: https://music.163.com/song/media/outer/url?id=1933915376.mp3
    music_url = f"https://music.163.com/song/media/outer/url?id={music_id}.mp3"
    music_content = requests.get(url=music_url, headers=headers).content
    music_name = change_name(music_name)
    with open(filename + music_name + '.mp3', mode='wb') as f:
        f.write(music_content)
    print(music_id, music_name)
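
Since the comment on the URL says a different chart only needs a different id, the whole thing wraps naturally into a function. A sketch reusing the helpers above (the skip-if-already-downloaded check is an addition; 19723756 is just the id from the script):

def download_toplist(toplist_id=19723756, folder="music\\"):
    """Download every song on the given NetEase chart into folder."""
    if not os.path.exists(folder):
        os.mkdir(folder)
    list_url = f"https://music.163.com/discover/toplist?id={toplist_id}"
    page = requests.get(url=list_url, headers=headers)
    for music_id, music_name in re.findall(r'<li><a href="/song\?id=(\d+)">(.*?)</a>', page.text):
        path = folder + change_name(music_name) + '.mp3'
        if os.path.exists(path):  # skip songs fetched on a previous run
            continue
        song_url = f"https://music.163.com/song/media/outer/url?id={music_id}.mp3"
        with open(path, mode='wb') as f:
            f.write(requests.get(url=song_url, headers=headers).content)
        print(music_id, music_name)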

The MP3 files are saved under the music folder.

Classical Chinese Poetry Scraper

# PoetryGet.py
import requests
import re

url = "https://www.oh100.com/shici/1127009.html"
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36'
}

response = requests.get(url=url, headers=headers)
response.encoding = response.apparent_encoding
# Each poem line on the page is numbered like 1、...
li = re.findall(r"\d+?、(.*?)</p><p>", response.text)
# The last match drags in the "related articles" footer; strip it off
li[-1] = re.sub(r"</p><script>s\(\"content_relate\"\)\;\<\/script\>\<p\>\【形容洒脱的诗词\】相关文章\:", "", li[-1])
for se in li:
    print("- " + se)

Wallpaper Scraper

# WallPaperGet.py
import requests
import re
import os

"""
Overall approach:
1. Visit the site and look around (if F12 is blocked, open DevTools on another page first, then navigate here)
2. Open the tag you want and inspect the request headers etc. (click "load more" before checking the files under XHR)
3. Open an individual photo page and inspect it
4. Extract title, img_url and so on from the text attribute
5. Save everything locally
"""

filename = "img\\"
if not os.path.exists(filename):
    os.mkdir(filename)

def change_title(name):  # rename the file in case the original title is not a legal filename
    new_name = re.sub(r'[\/\\\:\*\?\"\<\>\|]', '_', name)
    return new_name

for page in range(1, 6):  # number of pages to crawl
    url = f"https://m.bcoderss.com/tag/%e7%be%8e%e5%a5%b3/page/{page}/"  # %e7%be%8e%e5%a5%b3 is the URL-encoded tag 美女 ("beauty"); pick whichever tag you like
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36'
    }
    response = requests.post(url=url, headers=headers)  # the site expects a POST here, but no params are needed
    # print(response.text)
    href = re.findall('<li><a target="_blank" href="(.*?)"', response.text)[2:]  # all photo page links; the first two are junk
    for index in href:
        response_1 = requests.get(url=index, headers=headers)  # open the photo page
        title = re.findall("<title>(.*?)</title>", response_1.text)[0]  # photo title
        title = change_title(title)
        img_url = re.findall('<img alt=".*?" title=".*?" src="(.*?)">', response_1.text)[0]  # photo download URL
        img_content = requests.get(url=img_url, headers=headers).content  # fetch the image bytes
        with open(filename + title + '.jpg', mode='wb') as f:  # save the image
            f.write(img_content)
        print(title, img_url)
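
The tag segment of the URL is just the percent-encoded UTF-8 bytes of the tag name (%e7%be%8e%e5%a5%b3 decodes to 美女), so other tags can be generated instead of copied from the address bar:

from urllib.parse import quote

tag = '美女'  # substitute any tag the site offers
tag_url = f"https://m.bcoderss.com/tag/{quote(tag)}/page/1/"
# quote('美女') -> '%E7%BE%8E%E5%A5%B3', matching the hard-coded URL above
print(tag_url)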

The wallpapers are saved under the img folder (the images are not really appropriate to show here, so text will have to do…).

LOL Hero Skin Image Scraper

# LOL.py
import requests
import re
import os

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36'
}

def change_name(name):
    # Replace characters that are illegal (or just awkward) in filenames
    new_name = re.sub(r'[\/\\\:\*\?\"\<\>\|\✩\˚\*\•̩̩͙ʚ]', '_', name)
    return new_name

def save_img(name, title, skin_name, skin_url):
    filename = f'{name + title}\\'  # one folder per hero
    if not os.path.exists(filename):
        os.mkdir(filename)
    content = requests.get(url=skin_url, headers=headers).content
    newname = change_name(skin_name)
    with open(filename + newname + '.jpg', mode='wb') as f:
        f.write(content)
    print(skin_name)

hero_list_url = "https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js?ts=2748291"
response = requests.get(url=hero_list_url, headers=headers)
hero_list = response.json()['hero']
for hero in hero_list:
    hero_id = hero['heroId']
    hero_name = hero['name']
    hero_title = hero['title']
    hero_url = f"https://game.gtimg.cn/images/lol/act/img/js/hero/{hero_id}.js"
    response_1 = requests.get(url=hero_url, headers=headers)
    print(hero_id, hero_name, hero_title)
    skin_list = response_1.json()['skins']
    for skin in skin_list:
        if skin['mainImg']:  # some entries have an empty mainImg; skip them
            skin_url = skin['mainImg']
            skin_name = skin['name']
            save_img(hero_name, hero_title, skin_name, skin_url)
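
The ts query parameter on hero_list.js looks like a plain cache-buster; presumably any fresh value works, so the current epoch time is a natural choice (this is an assumption; the hard-coded value from the script also works):

import time

# Assumption: ts is only a cache-buster, so any changing number will do.
hero_list_url = f"https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js?ts={int(time.time())}"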

Each hero's skins are saved in a folder of its own.

Honor of Kings (王者荣耀) Hero Skin Scraper

# wzry.py
import requests
import re
import os

def change_name(name):
    # Replace characters that are illegal (or just awkward) in filenames
    new_name = re.sub(r'[\/\\\:\*\?\"\<\>\|\✩\˚\*\•̩̩͙ʚ]', '_', name)
    return new_name

def save_img(name, title, skin_name, skin_url):
    filename = f"{title + name}\\"  # one folder per hero
    if not os.path.exists(filename):
        os.mkdir(filename)
    content = requests.get(url=skin_url, headers=headers).content
    skin_name = change_name(skin_name)  # sanitize the skin name before using it as a filename
    with open(filename + skin_name + '_' + name + '.jpg', mode='wb') as f:
        f.write(content)

headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36'
}
herolist_url = "https://pvp.qq.com/web201605/js/herolist.json"
herolist = requests.get(url=herolist_url, headers=headers).json()
for hero in herolist:
    hero_ename = hero['ename']
    hero_name = hero['cname']
    hero_title = hero['title']
    print(hero_ename, hero_name)
    if 'skin_name' in hero:
        skin_list = hero['skin_name'].split('|')
    else:
        # Some heroes lack the skin_name field; scrape the names from the detail page instead
        hero_url = f"https://pvp.qq.com/web201605/herodetail/{hero_ename}.shtml"
        response = requests.get(url=hero_url, headers=headers)
        response.encoding = response.apparent_encoding
        # Note: this pattern expects exactly three entries in data-imgname
        tu = re.findall(r'<ul class="pic-pf-list pic-pf-list3" data-imgname="(.*?)\&0\|(.*?)\&0\|(.*?)\&72\">', response.text)
        skin_list = list(tu[0])
    for i in range(len(skin_list)):
        skin_url = f"https://game.gtimg.cn/images/yxzj/img201606/heroimg/{hero_ename}/{hero_ename}-bigskin-{i+1}.jpg"
        skin_name = skin_list[i]
        save_img(hero_name, hero_title, skin_name, skin_url)
        print(skin_name, hero_name)

Files are saved following the same scheme as the LOL scraper.

To be continued…