1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
import json
import os
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
# Page whose article list is scraped; also the base for resolving relative links.
NEWS_URL = "https://www.autohome.com.cn/news/1/#liststart"
# Directory that downloaded article images are written to.
IMG_DIR = "./img"
# File the collected news records are serialized to.
JSON_PATH = "news.json"
# Seconds to wait on any single HTTP request before giving up.
REQUEST_TIMEOUT = 10


def absolute_url(link, base=NEWS_URL):
    """Return *link* as an absolute URL, resolved against *base*.

    Handles protocol-relative links (``//host/path``), site-relative links
    (``/path``) and links that are already absolute, instead of blindly
    prefixing a scheme (which produced ``https:////host/path``).
    """
    return urljoin(base, link.strip())


def image_filename(img_url):
    """Return the file name component of *img_url*, without any query string."""
    return os.path.basename(urlsplit(img_url).path)


def parse_news(html, base=NEWS_URL):
    """Extract news records from the HTML of the news list page.

    Every ``<li>`` inside a ``<ul class="article">`` that contains an
    ``<h3>`` yields one dict with the title, summary, image URL and
    article URL. A missing ``<p>``, ``<img src>`` or ``<a href>`` results
    in an empty string for that field rather than an exception.
    """
    soup = BeautifulSoup(html, 'html.parser')
    news_list = []
    for ul in soup.find_all(name='ul', class_='article'):
        for li in ul.find_all(name='li'):
            h3 = li.find(name='h3')
            if h3 is None:
                # List items without a heading are not news entries.
                continue
            p = li.find(name='p')
            img = li.find(name='img')
            a = li.find(name='a')
            img_src = img.get('src') if img is not None else None
            href = a.get('href') if a is not None else None
            news_list.append({
                "新闻标题": h3.text,
                "新闻摘要": p.text if p is not None else "",
                "新闻图片": absolute_url(img_src, base) if img_src else "",
                "新闻地址": absolute_url(href, base) if href else "",
            })
    return news_list


def download_image(img_url, directory=IMG_DIR):
    """Download *img_url* into *directory*, creating the directory if needed.

    Raises ``requests.RequestException`` on network or HTTP errors.
    """
    img_name = image_filename(img_url)
    if not img_name:
        # URL ends in "/" - there is no usable file name to save under.
        return
    os.makedirs(directory, exist_ok=True)
    # stream=True so iter_content really reads the body chunk by chunk.
    with requests.get(img_url, stream=True, timeout=REQUEST_TIMEOUT) as res_img:
        res_img.raise_for_status()
        with open(os.path.join(directory, img_name), 'wb') as f:
            for chunk in res_img.iter_content(1024):
                f.write(chunk)


def main():
    """Scrape the news list, save the records as JSON and download the images."""
    res = requests.get(NEWS_URL, timeout=REQUEST_TIMEOUT)
    res.raise_for_status()

    news_list = parse_news(res.text)
    info = {"car_new": {"news": news_list}}

    # "w" rather than "a": appending a second JSON document on a re-run would
    # leave a file that no JSON parser can read. UTF-8 is set explicitly
    # because ensure_ascii=False writes the Chinese text unescaped.
    with open(JSON_PATH, "w", encoding="utf-8") as f:
        json.dump(info, f, ensure_ascii=False)

    for news in news_list:
        img_url = news["新闻图片"]
        if not img_url:
            continue
        try:
            download_image(img_url)
        except requests.RequestException as exc:
            # One broken image should not abort the remaining downloads.
            print("failed to download %s: %s" % (img_url, exc))


if __name__ == "__main__":
    main()
|