课程目标
爬取智联招聘网站的职位信息
课程内容
import requests # 导入requests库,用于发送网络请求
from bs4 import BeautifulSoup # 导入BeautifulSoup库,用于解析HTML文档
from tqdm import tqdm # 导入tqdm库,用于显示进度条
import pandas as pd # 导入pandas库,用于数据处理和导出Excel文件# 定义一个函数,用于将薪资字符串转换为数值
def tran_salary(ori_salary):
    """Convert a salary string such as ``"1.5万"`` or ``"8千"`` into a number.

    Args:
        ori_salary: Salary text. "万" scales the value by 10,000 and "千" by
            1,000; text containing neither unit is parsed as a plain number.

    Returns:
        The salary as a float.

    Raises:
        ValueError: If what remains after removing the unit is not numeric.
    """
    text = str(ori_salary).strip()
    # "万" is tested before "千", the same precedence the if/elif chain had.
    for unit, factor in (("万", 10000), ("千", 1000)):
        if unit in text:
            return float(text.replace(unit, "")) * factor
    # No unit present: previously the raw string was returned unchanged, which
    # broke the caller's arithmetic. Always hand back a number instead.
    return float(text)


# Request headers that imitate a browser.
headers = {
    # Host and caching hints.
    "authority": "www.zhaopin.com",
    "cache-control": "max-age=0",
    # Client hints and browser identification.
    "sec-ch-ua": '"Chromium";v="92", " Not A;Brand";v="99", "Microsoft Edge";v="92"',
    "sec-ch-ua-mobile": "?0",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67",
    # Content negotiation.
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    # Fetch metadata describing the request as a same-origin page navigation.
    "sec-fetch-site": "same-origin",
    "sec-fetch-mode": "navigate",
    "sec-fetch-user": "?1",
    "sec-fetch-dest": "document",
    "referer": "https://www.zhaopin.com/sou/jl801/kw01O00U80EG06G03F01N0/p1",
    "accept-language": "zh-CN,zh;q=0.9",
}

# Cookies sent with the request to keep the session alive.
cookies = {
    # The concrete cookie values are omitted here; they are normally the ones
    # the browser stores after logging in.
}

# URL of the page to scrape.
url = "https://www.zhaopin.com/sou/jl801/kw01O00U80EG06G03F01N0/p1"

# Fetch the page. requests applies no timeout by default, so a stalled
# connection would block forever; raise_for_status() stops the run on an HTTP
# error instead of parsing an error page as if it were a result list.
response = requests.get(url, headers=headers, cookies=cookies, timeout=10)
response.raise_for_status()

# Parse the returned HTML.
html_str = response.text
soup = BeautifulSoup(html_str, "html.parser")

# Collect every element that holds one job posting.
joblist = soup.find_all("div", class_="joblist-box__item")

# Rows gathered for export, one dict per job posting.
infos = []

# Walk over each job posting element.
def _text_of(node):
    """Return the stripped text of a parsed element, or "" when it is None."""
    return "" if node is None else node.text.strip()


def _padded_div_texts(parent, size):
    """Return the stripped text of the first ``size`` <div>s under ``parent``.

    Missing entries (or a missing ``parent``) are filled with "" so callers can
    always unpack exactly ``size`` values.
    """
    texts = [] if parent is None else [div.text.strip() for div in parent.find_all("div")]
    return (texts + [""] * size)[:size]


def _parse_salary(text):
    """Return the midpoint of a salary range such as "8千-1.2万·13薪".

    "面议" (negotiable) and text that cannot be parsed both yield 0.
    """
    if text == "面议":
        return 0
    # Only the part before "·" is the numeric range; the rest is a suffix.
    bounds = text.split("·")[0].split("-")
    try:
        # float() keeps this correct even if tran_salary returns a string for
        # unit-less input.
        amounts = [float(tran_salary(bound)) for bound in bounds]
    except ValueError:
        return 0
    # A single value is its own midpoint; for a range use both ends.
    return (amounts[0] + amounts[-1]) / 2


def _parse_experience(text):
    """Return the first number found in ``text`` as an int, or 0 if none.

    Covers "经验不限" (0), "1-3年" (1), "3年" (3) and forms such as "10年以上".
    """
    digits = ""
    for char in text:
        if char.isdecimal():
            digits += char
        elif digits:
            break
    return int(digits) if digits else 0


for job_item in tqdm(joblist):  # tqdm renders a progress bar
    # Skill tags; the box is absent on some postings.
    skills_box = job_item.find("div", class_="jobinfo__tag")
    skills = [] if skills_box is None else [tag.text for tag in skills_box.find_all("div")]

    # Location, experience and education requirements, in that order.
    area, experience_text, education_background_requirement = _padded_div_texts(
        job_item.find("div", class_="jobinfo__other-info"), 3
    )
    # The location reads "city·district·area"; trailing parts may be missing.
    city, classify, region = (area.split("·") + ["", "", ""])[:3]

    # Company facts: financing stage, head count, company type.
    finance_info, scale, company_type = _padded_div_texts(
        job_item.find("div", class_="companyinfo__tag"), 3
    )

    infos.append(
        {
            # The job title was scraped but never stored before.
            "职位名称": _text_of(job_item.find("a", class_="jobinfo__name")),
            "公司名字": _text_of(job_item.find("a", class_="companyinfo__name")),
            "薪资": _parse_salary(_text_of(job_item.find("p", class_="jobinfo__salary"))),
            "技能要求": skills,
            "市": city,
            "区": classify,
            "区域": region,
            "经验要求": _parse_experience(experience_text),
            "学历要求": education_background_requirement,
            "融资信息": finance_info,
            "规模": scale,
            "公司类型": company_type,
        }
    )

# Convert the collected rows into a pandas DataFrame.
df = pd.DataFrame(data=infos)

# Save the table as an Excel workbook, leaving out the index column.
df.to_excel(excel_writer="智联职位信息.xlsx", index=False)