进公司面试的时候,被问到是否会写Python,因为毕业设计扒过GEO (Gene Expression Omnibus)的数据,所以就说会啊!然后发现进公司的第一个任务就是让我去扒各个直播平台房间数的数据hhhh....
- 我们需要首先在工作目录下(PythonIDE的默认工作目录或者自己指定的程序文件目录)解压放置一个PhantomJS无界面浏览器。这个浏览器最大的好处就是可以模拟用户浏览器的登录,基本可以不用担心反爬虫的机制。
- 对于Python老手们来说Windows系统下安装一个Pycharm这个IDE也是事半功倍的,在Pycharm的Settings>Interpreter选项中可以UI界面安装package十分方便(我们接下来要使用的package也是这么安装的)
- Pycharm安装好之后,我们需要安装一系列的package:selenium,bs4,requests和lxml(threading是Python自带的标准库,不需要另外安装)。Python里头的一些自带包(例如threading和time)也不是启动时加载的,因此我们也需要到时候import它们。
1. 品类页面解析
def get_html(url):
    """Download *url* and return the response body as text.

    A browser-like User-Agent header is sent with the request.

    Raises:
        requests.RequestException: on connection failure or when the server
            does not answer within the timeout.
    """
    # Browser identification header sent along with the request.
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0: WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"}
    # requests waits indefinitely by default; a timeout keeps a stalled
    # connection from blocking the crawler forever.
    response = requests.get(url=url, headers=headers, timeout=30)
    return response.text  # decoded text used for the later parsing step


def get_category_html(html):
    """Return the list of category URLs found in the page source *html*.

    A category link is an ``<a>`` tag whose class attribute is
    ``"pic new-clickstat"``; the returned list holds each tag's ``href``.
    """
    soup = BeautifulSoup(html, "lxml")
    # href=True skips anchors without an href instead of raising KeyError.
    links = soup.find_all("a", class_="pic new-clickstat", href=True)
    return [link["href"] for link in links]
2. 房间数页面解析
def get_roomnum(cate_url):
    """Count the rooms of one category and append the result to the log file.

    The category page at *cate_url* is downloaded and its title and page
    count are read.  A single-page category is counted directly; otherwise
    PhantomJS opens the page, clicks the link to the last page, and the rooms
    on that page are added to ``(page_num - 1) * 120``.

    One tab-separated line (title, room count, record time, company name) is
    appended to ``<company_name>_roomNumber<record_time>.txt``.  The module
    globals ``company_name`` and ``record_time`` must be defined.

    Raises:
        requests.RequestException: if the category page cannot be downloaded
            within the timeout.
    """
    import time

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0: WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"}
    # A timeout keeps one stalled connection from blocking this worker forever.
    response = requests.get(url=cate_url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, "lxml")
    room_title = soup.find("h2", class_="title").string  # category name
    pager = soup.find("div", attrs={"class": "list-page", "id": "js-list-page", "data-pages": True})
    page_num = int(pager["data-pages"])  # number of result pages

    if page_num == 1:
        room_num = len(soup.find_all("a", class_="title new-clickstat"))
    else:
        # More than one page: drive a headless browser to reach the last page.
        # NOTE: executable_path is the location of the PhantomJS binary.
        driver = webdriver.PhantomJS(executable_path="E:/AnchengDeng/phantomjs-2.1.1-windows/bin/phantomjs.exe")
        try:
            driver.get(cate_url)
            # Locate and click the pager link that leads to the last page.
            xpath_str = "//a[@data-page='" + str(page_num) + "']"
            driver.find_element_by_xpath(xpath_str).click()
            # Fixed 2.5 s wait, chosen empirically by the original author.
            time.sleep(2.5)
            last_page = driver.page_source.encode("utf-8", "ignore")
        finally:
            # quit() ends the PhantomJS process even when a step above fails;
            # close() only closes the window and leaves the process running.
            driver.quit()
        last_soup = BeautifulSoup(last_page, "lxml")
        last_room_num = len(last_soup.find_all("a", class_="title new-clickstat"))
        # The original author reports that every full page lists 120 rooms.
        room_num = (page_num - 1) * 120 + last_room_num

    # Mode 'a' appends; 'w' would overwrite the lines of other categories.
    with open(company_name + '_roomNumber' + record_time + '.txt', 'a') as thefile:
        thefile.write("%s\t%s\t%s\t%s\n" % (room_title.encode('utf-8'), str(room_num), record_time, company_name))
    print("Done with " + room_title + " (" + str(room_num) + ")")  # progress signal
# -*- coding:utf-8 -*-
"""Hourly crawler that records the number of live rooms per Huya category.

Written for Python 2 with requests, BeautifulSoup (lxml parser) and
selenium's PhantomJS driver.  Running the file starts an endless loop that
collects all categories roughly once per hour.
"""
import threading  # multi-threading process
import time
from time import localtime, strftime

import requests
from bs4 import BeautifulSoup
from lxml import etree  # dissolve the page, faster than default html.parser
from selenium import webdriver

# Timestamp used in the output file name and in every written row.  It is
# refreshed by main() at the start of each collection run.
record_time = strftime("%Y-%m-%d_%H-%M-%S", localtime())
company_name = "huya"  # the Huya company

REQUEST_HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0: WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"}
REQUEST_TIMEOUT = 30  # seconds; requests waits forever when no timeout is given
PHANTOMJS_PATH = "E:/AnchengDeng/phantomjs-2.1.1-windows/bin/phantomjs.exe"
ROOMS_PER_PAGE = 120  # per the original author, every full page lists 120 rooms
CRAWL_INTERVAL_SECONDS = 3600 - 45  # pause between two collection runs


# obtain the source from a page: get the Live-broadcasting Category link
def get_html(url):
    """Download *url* and return the response body as text.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    response = requests.get(url=url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)
    return response.text


def get_category_html(html):
    """Return the href of every ``<a class="pic new-clickstat">`` in *html*."""
    soup = BeautifulSoup(html, "lxml")
    # href=True skips anchors without an href instead of raising KeyError.
    links = soup.find_all("a", class_="pic new-clickstat", href=True)
    return [link["href"] for link in links]


# ready for multi-thread
def get_roomnum(cate_url):
    """Count the rooms of one category and append the result to the log file.

    A single-page category is counted directly from the downloaded page.
    Otherwise PhantomJS opens the page, clicks the link to the last page, and
    the rooms found there are added to ``(page_num - 1) * ROOMS_PER_PAGE``.

    One tab-separated line (title, room count, record time, company name) is
    appended to ``<company_name>_roomNumber<record_time>.txt``.
    """
    soup = BeautifulSoup(get_html(cate_url), "lxml")
    room_title = soup.find("h2", class_="title").string  # category name
    pager = soup.find("div", attrs={"class": "list-page", "id": "js-list-page", "data-pages": True})
    page_num = int(pager["data-pages"])  # number of result pages

    if page_num == 1:
        room_num = len(soup.find_all("a", class_="title new-clickstat"))
    else:
        driver = webdriver.PhantomJS(executable_path=PHANTOMJS_PATH)
        try:
            driver.get(cate_url)
            # Locate and click the pager link that leads to the last page.
            xpath_str = "//a[@data-page='" + str(page_num) + "']"
            driver.find_element_by_xpath(xpath_str).click()
            # Fixed 2.5 s wait, chosen empirically by the original author.
            time.sleep(2.5)
            last_page = driver.page_source.encode("utf-8", "ignore")
        finally:
            # quit() ends the PhantomJS process even when a step above fails;
            # close() only closes the window and leaves the process running.
            driver.quit()
        last_soup = BeautifulSoup(last_page, "lxml")
        last_room_num = len(last_soup.find_all("a", class_="title new-clickstat"))
        room_num = (page_num - 1) * ROOMS_PER_PAGE + last_room_num

    # Mode 'a' appends; 'w' would overwrite the lines of other categories.
    with open(company_name + '_roomNumber' + record_time + '.txt', 'a') as thefile:
        thefile.write("%s\t%s\t%s\t%s\n" % (room_title.encode('utf-8'), str(room_num), record_time, company_name))
    print("Done with " + room_title + " (" + str(room_num) + ")")


def start_roomnum_collecting(category_html):
    """Start one get_roomnum thread per category URL.

    The threads are started but not joined here; the list of started threads
    is returned so that a caller may join them if it wants to wait.
    """
    threads = []
    for item in category_html:
        th = threading.Thread(target=get_roomnum, args=(item, ))
        th.start()
        threads.append(th)
    return threads


def main():
    """Run one collection pass over all categories.

    If the site moves to another address while keeping its page structure,
    only start_url needs to change.
    """
    global record_time
    # Refresh the timestamp for this run; without this every hourly run
    # would reuse the start-up time in both the file name and the rows.
    record_time = strftime("%Y-%m-%d_%H-%M-%S", localtime())

    start_url = "http://www.huya.com/g"
    start_html = get_html(start_url)
    html = get_category_html(start_html)
    start_roomnum_collecting(html)


# For a single pass, call main() once instead of looping.
# Permanent crawler: start a collection run about once per hour.
while True:
    print("Starting on Collecting " + company_name + " room number, good luck!")
    try:
        main()
    except requests.RequestException as err:
        # A failed download of the category index must not end the loop.
        print("Collecting failed, will retry on the next run: " + str(err))
    time.sleep(CRAWL_INTERVAL_SECONDS)
Leave A Comment