1. Data Preparation
I hit a snag while crawling: Lagou's positionAjax.json endpoint used to work very well, but now every request I try comes back as "请求频繁" (requests too frequent) with a 200 status code. In other words, the endpoint still receives the request; it just fails some check and refuses to return data. The same pages load fine in a browser, so it is not literally a rate-limiting problem. My guess is some kind of client-side IP restriction, where any request not coming from an expected client is treated as an external crawler, so I decided to work around it with Selenium.
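For reference, this is roughly what probing the old endpoint looked like. The form fields (first, pn, kd), the needAddtionalResult parameter, and the Referer requirement reflect how positionAjax.json historically behaved and may well have changed since:

import requests

# Hypothetical probe of the old positionAjax.json endpoint; field names are
# from the endpoint's historical behavior and may differ now.
url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
headers = {
    'User-Agent': 'Mozilla/5.0',
    'Referer': 'https://www.lagou.com/jobs/list_python',  # rejects requests without a plausible Referer
}
with requests.Session() as s:
    s.headers.update(headers)
    s.get('https://www.lagou.com/jobs/list_python')  # pick up session cookies first
    resp = s.post(url, data={'first': 'true', 'pn': 1, 'kd': 'python'})
    print(resp.status_code)        # 200 even when blocked
    print(resp.json().get('msg'))  # the "请求频繁" style message lives in the body, not the status code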
Multithreading has not triggered an IP ban for me so far; if it does, rotating through some open proxy IPs should solve it.
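If a ban does show up, here is a minimal sketch of routing Chrome through a proxy with the --proxy-server flag. The address below is a placeholder; substitute a live proxy from whatever pool you use:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

proxy = '127.0.0.1:8080'  # placeholder; swap in an open proxy ip:port
chrome_options = Options()
chrome_options.add_argument('--proxy-server=http://' + proxy)  # route all browser traffic through the proxy
driver = webdriver.Chrome(options=chrome_options)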
The complete code is as follows:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from threadpool import ThreadPool, makeRequests
import pandas as pd
import time
import os

def main():
    chrome_options = Options()
    # chrome_options.add_argument('--no-sandbox')  # fixes the "DevToolsActivePort file doesn't exist" error
    # chrome_options.add_argument('window-size=1920x3000')  # set the browser window size
    # chrome_options.add_argument('--disable-gpu')  # Google's docs recommend this to work around a bug
    # chrome_options.add_argument('--hide-scrollbars')  # hide scrollbars on some unusual pages
    # chrome_options.add_argument('blink-settings=imagesEnabled=false')  # skip loading images to speed things up
    # chrome_options.add_argument('--headless')  # no visible window; required on Linux without a display
    keyword_list = [
        '深度学习'  # ,
        # '算法',
        # '数据挖掘',
        # '自然语言处理',
        # '计算机视觉',
        # '推荐算法',
        # 'python',
        # '语音识别',
        # '数据分析'
    ]
    base_url = "https://www.lagou.com/jobs/list_{keyword}/p-city_{code}?&cl=false&fromSearch=true&labelWords=&suginput="
    selected_city = ['北京', '上海', '广州', '深圳', '杭州', '武汉', '南京', '成都', '长沙']
    selected_codes = ['2', '3', '213', '215', '6', '184', '79', '252', '198']
    pool = ThreadPool(5)
    param = []
    for i, selected_code in enumerate(selected_codes):
        # one task per (city, keyword) pair, each with its own browser instance
        for word in keyword_list:
            driver = webdriver.Chrome(options=chrome_options)
            param.append(((i, selected_code, word, driver), None))
    reqs = makeRequests(get_detail, param)
    for req in reqs:
        pool.putRequest(req)
    pool.wait()
    # merge the per-city/per-keyword CSVs into a single data.csv
    file_list = os.listdir('./data')
    if os.path.exists('./data.csv'):
        os.remove('./data.csv')
    for n in file_list:
        data = pd.read_csv('./data/' + n)  # listdir already returns full file names, extension included
        data.to_csv('./data.csv', mode='a', index=None,
                    header=not os.path.exists('./data.csv'))  # write the header row only once

def get_detail(i, selected_code, word, driver):
    base_url = "https://www.lagou.com/jobs/list_{keyword}/p-city_{code}?&cl=false&fromSearch=true&labelWords=&suginput="
    selected_city = ['北京', '上海', '广州', '深圳', '杭州', '武汉', '南京', '成都', '长沙']
    output = []
    city_detail_url = base_url.format(code=selected_code, keyword=word)
    # open the first results page
    driver.get(city_detail_url)
    # close the popup if it appears; find_element raises instead of returning None
    try:
        driver.find_element_by_xpath('/html/body/div[7]/div/div[2]').click()
    except NoSuchElementException:
        pass
    page = driver.find_elements_by_xpath('//*[@id="s_position_list"]/div[3]/div/span')[-1]
    loop = True
    while loop:
        time.sleep(2)  # give the page time to render
        city_list = driver.find_elements_by_xpath('//*[@id="s_position_list"]/ul/li')
        for city_detail in city_list:
            item = {}
            info_primary = city_detail.find_element_by_xpath('.//div[1]/div[1]')
            info_company = city_detail.find_element_by_css_selector('.company')
            info_pic = city_detail.find_element_by_xpath('.//div[1]/div[3]')
            info_type = city_detail.find_element_by_css_selector('.list_item_bot .li_b_l')
            info_fuli = city_detail.find_element_by_css_selector('.list_item_bot .li_b_r')
            item['cityname'] = selected_city[i]
            item['keyword'] = word
            item['title'] = info_primary.find_element_by_xpath('.//div[1]/a/h3').text
            item['area'] = info_primary.find_element_by_xpath('.//div[1]/a/span').text
            item['publishdate'] = info_primary.find_element_by_xpath('.//div[1]/span').text
            # the second div's text is "salary experience/education"
            salary_line = info_primary.find_element_by_xpath('.//div[2]').text
            item['salary'] = salary_line.split(' ')[0]
            item['experience'] = salary_line.split(' ')[1].split('/')[0]
            item['education'] = salary_line.split(' ')[1].split('/')[1]
            item['company'] = info_company.find_element_by_xpath('.//div[1]').text
            comp = info_company.find_element_by_xpath('.//div[2]').text.split('/')
            item['industry'] = comp[0]
            item['finance'] = comp[1]
            item['company_size'] = comp[2]
            item['logo'] = info_pic.find_element_by_xpath('.//a/img').get_attribute('src')
            item['recruiter'] = info_type.text
            item['benefits'] = info_fuli.text
            output.append(item)
        # grab the pager and move on unless the "next" button is disabled
        if 'pager_next_disabled' not in page.get_attribute('class'):
            page.click()
            page = driver.find_elements_by_xpath('//*[@id="s_position_list"]/div[3]/div/span')[-1]
        else:
            loop = False
    output = pd.DataFrame(output)
    os.makedirs('./data', exist_ok=True)  # make sure the output directory exists
    output.to_csv('./data/data{name}.csv'.format(name='_' + selected_city[i] + '_' + word), index=None)
    driver.quit()

if __name__ == "__main__":
    main()
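The third-party threadpool package is fairly old and unmaintained; the same fan-out can be sketched with the standard library's concurrent.futures instead. This assumes chrome_options, keyword_list, selected_codes, and get_detail as defined above:

from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
from selenium import webdriver

# Sketch only: reuses chrome_options, keyword_list, selected_codes and
# get_detail from the script above.
with ThreadPoolExecutor(max_workers=5) as executor:
    futures = []
    for i, selected_code in enumerate(selected_codes):
        for word in keyword_list:
            driver = webdriver.Chrome(options=chrome_options)
            futures.append(executor.submit(get_detail, i, selected_code, word, driver))
    wait(futures, return_when=ALL_COMPLETED)  # block until every city/keyword task finishes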
This post is shared for learning purposes only. Please do not use the data commercially; if it affects Lagou in any way, contact me and I will take it down.