News crawler

Scrapes article links from chinanews.com category pages with requests + BeautifulSoup, then stores each article's title, body, timestamp, and source into MySQL via pymysql.
import random

import pymysql
import requests
from bs4 import BeautifulSoup
links = []  # (category label, article href) pairs collected by getLink()
# Desktop-browser User-Agent so the site serves its normal HTML pages.
hea = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'
}
# Category list pages to crawl (main() below only crawls taiwan.shtml;
# see the sketch after the listing for covering all of these).
urls = [
    "https://www.chinanews.com/china.shtml",
    "https://www.chinanews.com/society.shtml",
    "https://www.chinanews.com/compatriot.shtml",
    "https://www.chinanews.com/wenhua.shtml",
    "https://www.chinanews.com/world.shtml",
    "https://www.chinanews.com/cj/gd.shtml",
    "https://www.chinanews.com/sports.shtml",
    "https://www.chinanews.com/huaren.shtml",
]
db = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                     port=3306, db='news_recommendation_system')
cursor = db.cursor()


def main():
    baseurl = 'https://www.chinanews.com/taiwan.shtml'
    getLink(baseurl)
    getInformationAndSave()
    db.close()


def getInformationAndSave():
    # Visit every collected article link, parse the page, and persist it.
    for link in links:
        url = "https://www.chinanews.com" + link[1]
        cur_html = requests.get(url, headers=hea)
        cur_html.encoding = "utf8"
        soup = BeautifulSoup(cur_html.text, 'html.parser')
        title = soup.find('h1').text.strip()
        # The 'left-t' div holds "<date> <time> <source>", whitespace-separated.
        tr = soup.find('div', class_='left-t').text.split()
        time = tr[0] + tr[1]
        recourse = tr[2]
        content = soup.find('div', class_="left_zw").text.strip()
        print(link[0] + "---" + title + "---" + time + "---" + recourse + "---" + url)
        saveDate(title, content, time, recourse, url)


def deleteDate():
    # Clear the news table (utility; not called from main()).
    sql = "DELETE FROM news"
    try:
        cursor.execute(sql)
        db.commit()
    except Exception:
        db.rollback()


def saveDate(title, content, time, recourse, url):
    try:
        # Parameterized query: the driver handles quoting/escaping instead
        # of the values being interpolated into the SQL string.
        cursor.execute(
            "INSERT INTO news(news_title, news_content, type_id, "
            "news_creatTime, news_recourse, news_link) "
            "VALUES (%s, %s, %s, %s, %s, %s)",
            (title, content, random.randint(1, 8), time, recourse, url))
        db.commit()
        print("Insert succeeded")
    except Exception:
        db.rollback()
        print("Insert failed")


def getLink(baseurl):
    # Collect (category label, article href) pairs from a category list page.
    html = requests.get(baseurl, headers=hea)
    html.encoding = 'utf8'
    soup = BeautifulSoup(html.text, 'html.parser')
    for item in soup.select('div.content_list > ul > li'):
        if item.a is None:
            continue
        news_type = item.div.text[1:3]  # two-character category label
        link = item.div.next_sibling.next_sibling.a['href']
        links.append([news_type, link])


if __name__ == '__main__':
    main()
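The listing assumes a `news` table that is never defined here. Below is a minimal schema sketch that would satisfy the INSERT in saveDate; the column types and the `news_id` primary key are assumptions inferred from the data being stored, not the author's actual DDL.

# Hypothetical schema sketch -- column types and the news_id key are
# assumptions inferred from the INSERT above, not the original DDL.
CREATE_NEWS_TABLE = """
CREATE TABLE IF NOT EXISTS news (
    news_id        INT AUTO_INCREMENT PRIMARY KEY,
    news_title     VARCHAR(255),
    news_content   TEXT,
    type_id        INT,
    news_creatTime VARCHAR(64),
    news_recourse  VARCHAR(128),
    news_link      VARCHAR(255)
)
"""

cursor.execute(CREATE_NEWS_TABLE)
db.commit()

Running this once against the same `db`/`cursor` before the first crawl would give saveDate a table to insert into on a fresh database.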
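As written, main() crawls only taiwan.shtml even though `urls` lists eight more category pages, and type_id is assigned at random. A minimal sketch of one way to cover every category by reusing getLink and getInformationAndSave; crawl_all_categories is not part of the original:

# Sketch, not the author's code: crawl every category page in `urls`.
# The shared `links` buffer is cleared between categories so articles
# from one category are not saved again under the next.
def crawl_all_categories():
    for base in urls:
        links.clear()
        getLink(base)
        getInformationAndSave()

Calling crawl_all_categories() in place of the getLink/getInformationAndSave pair in main() would cover every category; giving each category a fixed type_id instead of random.randint(1, 8) would additionally require passing that id through to saveDate.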