python百度下拉框关键词采集源码:
# Baidu search-suggestion ("drop-down box") keyword scraper.
import json
import urllib
import urllib.parse

import requests

# Seconds to wait for Baidu before giving up; requests has no default timeout
# and would otherwise block forever on a stalled connection.
REQUEST_TIMEOUT = 10

# Suffixes appended to the seed word by the get_more_* aggregators.
_ALPHABET = 'abcdefghijklmnopqrstuvwxyz'


def _strip_jsonp(text):
    """Return the payload between a JSONP wrapper's outermost parentheses.

    Only the wrapper is removed, e.g. ``window.baidu.sug({...});`` becomes
    ``{...}``. Parentheses and semicolons inside the payload are left alone.

    Raises:
        ValueError: if *text* contains no ``(...)`` wrapper.
    """
    start = text.find('(')
    end = text.rfind(')')
    if start == -1 or end < start:
        raise ValueError('response is not a JSONP call: %r' % text[:80])
    return text[start + 1:end]


def _unique(words):
    """Return *words* without duplicates, keeping first-seen order."""
    return list(dict.fromkeys(words))


def get_keywords(word):
    """Fetch suggestions for *word* from the ``sugrec`` JSON endpoint.

    Each suggestion is printed as it is read.

    Args:
        word: raw (not percent-encoded) query text.

    Returns:
        List of suggestion strings (the ``q`` field of every entry in ``g``).
    """
    # Escape the word so characters such as '&' or '#' cannot break the query.
    quoted = urllib.parse.quote_plus(word)
    url = f"https://www.baidu.com/sugrec?pre=1&ie=utf-8&json=1&prod=pc&wd={quoted}"
    data = requests.get(url, timeout=REQUEST_TIMEOUT).json()
    key_words = []
    # Treat a missing 'g' key as "no suggestions" instead of raising KeyError.
    for entry in data.get('g', []):
        print(entry['q'])
        key_words.append(entry['q'])
    return key_words


def get_sug(word):
    """Fetch suggestions for *word* from the ``sp0.baidu.com`` JSONP endpoint.

    Args:
        word: raw (not percent-encoded) query text.

    Returns:
        The ``s`` list of the decoded response.
    """
    url = 'https://sp0.baidu.com/5a1Fazu8AA54nxGko9WTAnF6hhy/su?wd=%s&sugmode=2&json=1&p=3&sid=1427_21091_21673_22581&req=2&pbs=%%E5%%BF%%AB%%E6%%89%%8B&csor=2&pwd=%%E5%%BF%%AB%%E6%%89%%8B&cb=jQuery11020924966752020363_1498055470768&_=1498055470781' % urllib.parse.quote_plus(word)
    # TLS verification is left enabled: get_word_scv queries this same host
    # with verification on, so disabling it here only weakened security.
    r = requests.get(url, timeout=REQUEST_TIMEOUT)
    # The body is decoded as GBK, then the "callback(...);" wrapper is removed
    # by locating the parentheses rather than by a fixed byte offset.
    res = _strip_jsonp(r.content.decode('gbk'))
    res_json = json.loads(res)
    return res_json['s']


def get_word(word):
    """Fetch suggestions for *word* from ``suggestion.baidu.com``.

    Args:
        word: raw (not percent-encoded) query text.

    Returns:
        The ``s`` list of the decoded response.
    """
    quoted = urllib.parse.quote_plus(word)
    url = f'http://suggestion.baidu.com/su?wd={quoted}&sugmode=3&json=1'
    text = requests.get(url, timeout=REQUEST_TIMEOUT).text
    # Remove only the JSONP wrapper; stripping every ')' and ';' would also
    # delete those characters from inside the suggestions themselves.
    data = json.loads(_strip_jsonp(text))
    return data['s']


def get_word_scv():
    """Read seed words from ``gjc.txt`` and append suggestions to ``word.csv``.

    ``gjc.txt`` holds one seed word per line; blank lines are skipped. Every
    suggestion is printed and written to ``word.csv`` on its own line.
    ``word.csv`` is opened in append mode with the platform default encoding.
    """
    # 'utf-8-sig' reads plain UTF-8 too, and drops a leading BOM if present so
    # it does not end up glued to the first seed word.
    with open('gjc.txt', encoding='utf-8-sig') as seeds, \
            open('word.csv', 'a+') as out:
        for line in seeds:
            # Drop the trailing newline so it is not sent as part of the query.
            word = line.strip()
            if not word:
                continue
            quoted = urllib.parse.quote_plus(word)
            print(quoted)
            url = 'https://sp0.baidu.com/5a1Fazu8AA54nxGko9WTAnF6hhy/su?wd=%s&sugmode=2&json=1&p=3&sid=1427_21091_21673_22581&req=2' % quoted
            text = requests.get(url, timeout=REQUEST_TIMEOUT).text
            data = json.loads(_strip_jsonp(text))
            for suggestion in data['s']:
                print(suggestion)
                out.write('%s\n' % suggestion)


def get_more_word(word):
    """Collect suggestions for *word* followed by each letter a-z.

    Uses get_keywords for every lookup and prints the combined list, its
    length, and the number of distinct suggestions.

    Returns:
        List of distinct suggestions in first-seen order.
    """
    more_word = []
    for letter in _ALPHABET:
        more_word.extend(get_keywords('%s%s' % (word, letter)))
    unique_words = _unique(more_word)
    print(more_word)
    print(len(more_word))
    print(len(unique_words))
    return unique_words


def get_more_sug(word):
    """Collect suggestions for *word* followed by each letter a-z.

    Uses get_sug for every lookup and prints the number of distinct
    suggestions.

    Returns:
        List of distinct suggestions in first-seen order.
    """
    all_words = []
    for letter in _ALPHABET:
        all_words += get_sug(word + letter)
    unique_words = _unique(all_words)
    print(len(unique_words))
    return unique_words
本文提供多种 Python 百度下拉框关键词采集方式,均基于百度 API 接口实现,采集结果可导出为 CSV 文件(可用 Excel 打开)。文中包含 4 个采集函数及 2 个汇总函数,可根据自己的需求灵活使用。