廢話不多說直接開始
image
爬取京東單頁評論
一、找到商品評論
image
二、檢查網頁源代碼
搜索後發現並沒有想要的內容
image
打開開發者工具,查看抓取到的請求,發現其中有評論內容,我們就可以對該請求的 url 地址發送請求來獲取數據
image.png
image
三、開始編寫程序
代碼
import requests
import re
# Single-page scraper: request one page of product comments and print each
# comment body with a 1-based running number.

# Request headers copied from a desktop Chrome session so the request looks
# like the one the product page itself issues.
headers = {
    'Connection': 'keep-alive',
    'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
    'sec-ch-ua-mobile': '?0',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36',
    'Accept': '*/*',
    'Sec-Fetch-Site': 'same-site',
    'Sec-Fetch-Mode': 'no-cors',
    'Sec-Fetch-Dest': 'script',
    'Referer': 'https://item.jd.com/',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}
# Cookies captured from a browser session.
# NOTE(review): these are hard-coded session values; they presumably expire
# and should not be committed to a shared repository -- confirm before reuse.
cookies = {
    '__jdu': '1140149181',
    'shshshfpb': 'dxvAdGKVNzAegFZ04SPRPjw%3D%3D',
    'shshshfpa': '551d8e1b-9679-a2a9-4853-c893fad3a0c2-1588218470',
    'areaId': '13',
    'ipLoc-djd': '13-1042-3528-0',
    'unpl': 'V2_ZzNtbRBTFkYhDBZQeB4PBmIDEFwSXhYWcQERBykfWVFkBEcJclRCFnUUR1NnGFkUZwsZX0RcQBxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHsZWQRnBhpdS1dzJXI4dmR4HVsHZgIiXHJWc1chVERTcx1bACoDElhDV0YddQFGZHopXw%3d%3d',
    '__jdv': '76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_c5dbe5a526b24130a8d258fbc74e26de|1622806649582',
    '__jda': '122270672.1140149181.1607922153.1622723717.1622806650.7',
    '__jdc': '122270672',
    '_gcl_au': '1.1.1090722577.1622806657',
    'shshshfp': '2642ee3c640efeb0e9447e6545757fca',
    '__jdb': '122270672.5.1140149181|7.1622806650',
    'shshshsID': '3742c97b007a5cf7adaa9cff4323c957_3_1622806689501',
    'JSESSIONID': '81C1CD5FA9D0F18E7FFADD802EC34264.s1',
    'jwotest_product': '99',
    '3AB9D23F7A4B3C9B': '6NAODRKK6T33JSTFT3NYNWJAJQ2BCPHUZTUM73ZFAJPIMAS44RCYDE4BC6G7LRUPAWKISABMYIUWYB2LIDAMRKRPVU',
}
# Comment endpoint for one product; this URL asks for page=2 with pageSize=10.
url = "https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=100020318814&score=0&sortType=5&page=2&pageSize=10&isShadowSku=0&rid=0&fold=1"
# requests has no default timeout; without one a stalled connection would
# block the script indefinitely.
response = requests.get(url=url, headers=headers, cookies=cookies, timeout=10).text
# Non-greedy capture of every "content":"..." field in the raw response text.
res = '"content":"(.*?)"'
content = re.findall(res, response)
for index, i in enumerate(content, start=1):
    print("第{}條評論:".format(index) + i + "\n")
運行結果
image
京東多頁評論爬取
image
通過比對前三頁的 url 地址可以發現只有 page 的值有變化,所以我們可以用列表推導式來生成前 10 頁的 url,依次發送請求來實現多頁爬取
def url_list(first_page=1, last_page=10):
    """Build the comment-page URLs for a range of page numbers.

    Only the ``page`` query parameter differs between pages, so each URL is
    produced by filling that single placeholder in a fixed template.

    Args:
        first_page: First page number to include. Defaults to 1.
        last_page: Last page number to include (inclusive). Defaults to 10.

    Returns:
        A list of URL strings in ascending page order; empty when
        ``last_page`` is smaller than ``first_page``.
    """
    template = "https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=100020318814&score=0&sortType=5&page={}&pageSize=10&isShadowSku=0&rid=0&fold=1"
    return [template.format(page) for page in range(first_page, last_page + 1)]
image
編寫程序
代碼
import requests
import re
def url_list(first_page=1, last_page=10):
    """Build the comment-page URLs for a range of page numbers.

    Only the ``page`` query parameter differs between pages, so each URL is
    produced by filling that single placeholder in a fixed template.

    Args:
        first_page: First page number to include. Defaults to 1.
        last_page: Last page number to include (inclusive). Defaults to 10.

    Returns:
        A list of URL strings in ascending page order; empty when
        ``last_page`` is smaller than ``first_page``.
    """
    template = "https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=100020318814&score=0&sortType=5&page={}&pageSize=10&isShadowSku=0&rid=0&fold=1"
    return [template.format(page) for page in range(first_page, last_page + 1)]
def url_parse(list):
    """Fetch every comment page and print each comment with a running number.

    Args:
        list: Iterable of comment-page URLs, requested in order. The
            parameter name shadows the builtin; it is kept so existing
            callers continue to work.

    Returns:
        None. Each comment is written to standard output, numbered from 1
        continuously across all pages.
    """
    # Headers and cookies are the same for every page, so they are built
    # once rather than on every loop iteration.
    headers = {
        'Connection': 'keep-alive',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'sec-ch-ua-mobile': '?0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36',
        'Accept': '*/*',
        'Sec-Fetch-Site': 'same-site',
        'Sec-Fetch-Mode': 'no-cors',
        'Sec-Fetch-Dest': 'script',
        'Referer': 'https://item.jd.com/',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    # NOTE(review): hard-coded browser session cookies; they presumably
    # expire -- confirm before reuse.
    cookies = {
        '__jdu': '1140149181',
        'shshshfpb': 'dxvAdGKVNzAegFZ04SPRPjw%3D%3D',
        'shshshfpa': '551d8e1b-9679-a2a9-4853-c893fad3a0c2-1588218470',
        'areaId': '13',
        'ipLoc-djd': '13-1042-3528-0',
        'unpl': 'V2_ZzNtbRBTFkYhDBZQeB4PBmIDEFwSXhYWcQERBykfWVFkBEcJclRCFnUUR1NnGFkUZwsZX0RcQBxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHsZWQRnBhpdS1dzJXI4dmR4HVsHZgIiXHJWc1chVERTcx1bACoDElhDV0YddQFGZHopXw%3d%3d',
        '__jdv': '76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_c5dbe5a526b24130a8d258fbc74e26de|1622806649582',
        '__jda': '122270672.1140149181.1607922153.1622723717.1622806650.7',
        '__jdc': '122270672',
        '_gcl_au': '1.1.1090722577.1622806657',
        'shshshfp': '2642ee3c640efeb0e9447e6545757fca',
        '__jdb': '122270672.5.1140149181|7.1622806650',
        'shshshsID': '3742c97b007a5cf7adaa9cff4323c957_3_1622806689501',
        'JSESSIONID': '81C1CD5FA9D0F18E7FFADD802EC34264.s1',
        'jwotest_product': '99',
        '3AB9D23F7A4B3C9B': '6NAODRKK6T33JSTFT3NYNWJAJQ2BCPHUZTUM73ZFAJPIMAS44RCYDE4BC6G7LRUPAWKISABMYIUWYB2LIDAMRKRPVU',
    }
    # Non-greedy capture of every "content":"..." field in the response text.
    comment_pattern = re.compile(r'"content":"(.*?)"')
    index = 1
    for url in list:
        # requests has no default timeout; set one so a stalled connection
        # cannot block the whole crawl.
        response = requests.get(url=url, headers=headers, cookies=cookies, timeout=10).text
        for comment in comment_pattern.findall(response):
            print(f"第{index}條評論:" + comment + "\n")
            index += 1
if __name__ == '__main__':
    # Build the page URLs, then fetch and print the comments of every page.
    # The local is called `urls` so the builtin `list` is not shadowed at
    # module level.
    urls = url_list()
    url_parse(urls)
運行結果
image
爬取京東多頁評論生成excel表格
前面我們實現了爬取京東多頁評論的功能,現在只要利用 Pandas 生成 excel 就可以
image
def excel(i, c, t):
    """Write the scraped comments to the Excel file ``./京東評論.xlsx``.

    The sheet has three columns, in this order: 時間 (time), ID and
    評論內容 (comment text). The DataFrame index is written as well, since
    ``to_excel`` is called with its defaults.

    Args:
        i: List of ID values, one per comment.
        c: List of comment texts.
        t: List of comment times.

    Returns:
        None. The file is written relative to the current working directory.
    """
    # t, i and c are lists; pandas requires them to have the same length.
    x = pd.DataFrame({"時間": t, "ID": i, "評論內容": c})
    x.to_excel("./京東評論.xlsx")
代碼
import requests
import re
import pandas as pd
def url_list(first_page=1, last_page=10):
    """Build the comment-page URLs for a range of page numbers.

    Only the ``page`` query parameter differs between pages, so each URL is
    produced by filling that single placeholder in a fixed template.

    Args:
        first_page: First page number to include. Defaults to 1.
        last_page: Last page number to include (inclusive). Defaults to 10.

    Returns:
        A list of URL strings in ascending page order; empty when
        ``last_page`` is smaller than ``first_page``.
    """
    template = "https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98&productId=100020318814&score=0&sortType=5&page={}&pageSize=10&isShadowSku=0&rid=0&fold=1"
    return [template.format(page) for page in range(first_page, last_page + 1)]
def url_parse(list):
    """Fetch every comment page and collect ID, text and time per comment.

    Args:
        list: Iterable of comment-page URLs, requested in order. The
            parameter name shadows the builtin; it is kept so existing
            callers continue to work.

    Returns:
        A tuple ``(id_list, content_list, time_list)`` of three equally
        long lists accumulated across all pages. The three lists are also
        printed to standard output before returning.
    """
    # Headers and cookies are the same for every page, so they are built
    # once rather than on every loop iteration.
    headers = {
        'Connection': 'keep-alive',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'sec-ch-ua-mobile': '?0',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36',
        'Accept': '*/*',
        'Sec-Fetch-Site': 'same-site',
        'Sec-Fetch-Mode': 'no-cors',
        'Sec-Fetch-Dest': 'script',
        'Referer': 'https://item.jd.com/',
        'Accept-Language': 'zh-CN,zh;q=0.9',
    }
    # NOTE(review): hard-coded browser session cookies; they presumably
    # expire -- confirm before reuse.
    cookies = {
        '__jdu': '1140149181',
        'shshshfpb': 'dxvAdGKVNzAegFZ04SPRPjw%3D%3D',
        'shshshfpa': '551d8e1b-9679-a2a9-4853-c893fad3a0c2-1588218470',
        'areaId': '13',
        'ipLoc-djd': '13-1042-3528-0',
        'unpl': 'V2_ZzNtbRBTFkYhDBZQeB4PBmIDEFwSXhYWcQERBykfWVFkBEcJclRCFnUUR1NnGFkUZwsZX0RcQBxFCEdkeBBVAWMDE1VGZxBFLV0CFSNGF1wjU00zQwBBQHcJFF0uSgwDYgcaDhFTQEJ2XBVQL0oMDDdRFAhyZ0AVRQhHZHsZWQRnBhpdS1dzJXI4dmR4HVsHZgIiXHJWc1chVERTcx1bACoDElhDV0YddQFGZHopXw%3d%3d',
        '__jdv': '76161171|baidu-pinzhuan|t_288551095_baidupinzhuan|cpc|0f3d30c8dba7459bb52f2eb5eba8ac7d_0_c5dbe5a526b24130a8d258fbc74e26de|1622806649582',
        '__jda': '122270672.1140149181.1607922153.1622723717.1622806650.7',
        '__jdc': '122270672',
        '_gcl_au': '1.1.1090722577.1622806657',
        'shshshfp': '2642ee3c640efeb0e9447e6545757fca',
        '__jdb': '122270672.5.1140149181|7.1622806650',
        'shshshsID': '3742c97b007a5cf7adaa9cff4323c957_3_1622806689501',
        'JSESSIONID': '81C1CD5FA9D0F18E7FFADD802EC34264.s1',
        'jwotest_product': '99',
        '3AB9D23F7A4B3C9B': '6NAODRKK6T33JSTFT3NYNWJAJQ2BCPHUZTUM73ZFAJPIMAS44RCYDE4BC6G7LRUPAWKISABMYIUWYB2LIDAMRKRPVU',
    }
    # One non-greedy pattern per field, compiled once for all pages.
    content_pattern = re.compile(r'"content":"(.*?)"')
    guid_pattern = re.compile(r'"guid":"(.*?)"')
    time_pattern = re.compile(r'"creationTime":"(.*?)"')

    id_list = []
    content_list = []
    time_list = []
    for url in list:
        # requests has no default timeout; set one so a stalled connection
        # cannot block the whole crawl.
        response = requests.get(url=url, headers=headers, cookies=cookies, timeout=10).text
        page_contents = content_pattern.findall(response)
        page_ids = guid_pattern.findall(response)
        page_times = time_pattern.findall(response)
        # zip stops at the shortest of the three match lists, which keeps
        # the output lists the same length.
        # NOTE(review): the fields are matched independently, so if one key
        # occurs more often than the others the rows may be misaligned --
        # verify against a real response.
        for comment_id, comment, created in zip(page_ids, page_contents, page_times):
            id_list.append(comment_id)
            content_list.append(comment)
            time_list.append(created)
    print(id_list, content_list, time_list)
    return id_list, content_list, time_list
def excel(i, c, t):
    """Write the scraped comments to the Excel file ``./京東評論.xlsx``.

    The sheet has three columns, in this order: 時間 (time), ID and
    評論內容 (comment text). The DataFrame index is written as well, since
    ``to_excel`` is called with its defaults.

    Args:
        i: List of ID values, one per comment.
        c: List of comment texts.
        t: List of comment times.

    Returns:
        None. The file is written relative to the current working directory.
    """
    # pandas requires the three lists to have the same length.
    x = pd.DataFrame({"時間": t, "ID": i, "評論內容": c})
    x.to_excel("./京東評論.xlsx")
if __name__ == '__main__':
    # Build the page URLs, scrape ID / text / time for every comment, then
    # export the result to Excel. The local is called `urls` so the builtin
    # `list` is not shadowed at module level.
    urls = url_list()
    ids, contents, times = url_parse(urls)
    excel(ids, contents, times)