代码
def parseBaidu(keyword, pagenum): keywordsBaseURL = 'https://www.baidu.com/s?wd=' + str(quote(keyword)) + '&oq=' + str(quote(keyword)) + '&ie=utf-8' + '&pn=' pnum = 0 while pnum <= int(pagenum): baseURL = keywordsBaseURL + str(pnum*10) try: request = requests.get(baseURL, headers=headers) soup = BeautifulSoup(request.text, "html.parser") for a in soup.select('div.c-container > h3 > a'): url = requests.get(a['href'], headers=headers).url yield url except: yield None finally: pnum += 1
示例用法
import requestsfrom bs4 import BeautifulSoupheaders = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0"}def parseBaidu(keyword, pagenum)def main(): for url in parseBaidu("keyword",10): if url: print(url) else: continue