BeautifulSoup4的使用

发表于 2022-02-08 分类于 crawler 阅读次数：本文字数： 11k 阅读时长 ≈ 10 分钟

爬取梨视频

requests 模拟发送 http 请求
解析库进行解析，有 re bs4 lxml
requests-html: 发送请求+解析xml
  
视频去水印: fmmpeg模块 加水印，拼接裁剪，抠图，转码

import re
import requests

# res = requests.get("https://www.pearvideo.com/")
# print(res.text)

# 页面中发送有 Ajax 请求得到存放视频地址的页面
# https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=59&start=240&mrd=0.4791462124097654&filterIds=1761446,1761399,1761415,1745287,1745362,1745246,1745234,1745171,1745100,1745122,1745112,1744971,1744958,1744950,1744934
# https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=59&start=0

# res = requests.get("https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=59&start=0")
# print(res.text)

# 取出视频 video
# <a href="video_1761446" class="vervideo-lilink actplay">
# https://www.pearvideo.com/video_1761399


res = requests.get("https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=59&start=0")
video_list = re.findall('<a href="(.*?)" class="vervideo-lilink actplay">', res.text)
# print(video_list)
# 得到每个视频页面的地址
for video in video_list:
    video_url = 'https://www.pearvideo.com/' + video
    # print(video_url)
    # # 该页面又向后端发送了 Ajax请求 https://www.pearvideo.com/videoStatus.jsp?contId=1760940&mrd=0.07016044407010624
    # res_video = requests.get(video_url)
    # print(res_video.text)  # 拿到的没有页面中 mp4 的url信息 该文章已下线

    # 第一层反爬需要加 Referer
    video_id = video_url.split('_')[-1]
    header = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36",
        "Referer": video_url
    }
    res_video = requests.get(f"https://www.pearvideo.com/videoStatus.jsp?contId={video_id}", headers=header).json()
    mp4_url = res_video.get("videoInfo").get("videos").get("srcUrl")

    # 第二层反爬 需要将 url 变形
    # https://video.pearvideo.com/mp4/short/20220509/  1652081176661  -15876181-hd.mp4
    # https://video.pearvideo.com/mp4/short/20220509/  cont-1760940   -15876181-hd.mp4
    mp4_url = mp4_url.replace(mp4_url.split("/")[-1].split("-")[0], f'cont-{video_id}')
    # print(mp4_url)
    print(mp4_url)
    # 保存本地
    video_detail = requests.get(mp4_url)
    with open(f'./video/{video_id}.mp4', "wb") as f:
        for line in video_detail.iter_content(512):
            f.write(line)

`BS4`爬汽车之家新闻

# pip3 install beautifulsoup4
# pip3 install lxml
# https://www.autohome.com.cn/news/

# https://www.autohome.com.cn/news/1/#liststart

import json
import requests
from bs4 import BeautifulSoup

news_list = []
info = {"car_new": {"news": news_list}}

res = requests.get("https://www.autohome.com.cn/news/1/#liststart")
# print(res.text)
# html.parser 是 bs4 的默认解析库
soup = BeautifulSoup(res.text, 'html.parser')
ul_list = soup.find_all(name='ul', class_='article')
# print(len(ul_list))
for ul in ul_list:
    li_list = ul.find_all(name='li')
    # print(li_list)
    for li in li_list:
        h3 = li.find(name='h3')
        if h3:
            title = h3.text
            desc = li.find(name='p').text
            img = li.find(name='img')['src']
            if not img.startswith("http"):
                img = "https://" + img
            url = "https:" + li.find(name='a')['href']
            news_dict = {
                "新闻标题": title,
                "新闻摘要": desc,
                "新闻图片": img,
                "新闻地址": url,
            }
            news_list.append(news_dict)
            res_img=requests.get(img)
            img_name=img.split('/')[-1]
            with open('./img/%s'%img_name,'wb') as f:
                for line in res_img.iter_content(1024):
                    f.write(line)
            # 可以将信息保存至数据库中

with open("news.json", "a") as f:
    json.dump(info, f, ensure_ascii=False)

`BS4`遍历文档树

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>

<p class="title" id="id_p"> test <b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

# html.parser 内置的，速度一般,容错能力强
# soup = BeautifulSoup(html_doc, 'html.parser')

# lxml    第三方，速度快,容错能力强
# pip3 install lxml
soup = BeautifulSoup(html_doc, 'lxml')

# print(soup.prettify())  # 会对 html 进行美化 将对标签补齐等

# 遍历文档树 . 遍历 速度快
# print(soup.title)
# print(soup.body)
# print(soup.body.p)
# print(soup.body.p.b)
# print(soup.body.a)

# 获取标签名称
# print(soup.title.name)
# print(soup.body.name)
# print(soup.p.name)

# 获取标签属性
# print(soup.body.p)
# print(soup.body.p['class'])  # ['title'] 因为 class 可能有多个 所以是列表
# print(soup.body.p['id'])  # id_p  全局唯一
# print(soup.body.p.attrs)  # {'class': ['title'], 'id': 'id_p'} 所有属性放到字典中

# 获取标签的文本内容
# print(soup.p.text)  # 将标签中的文本内容和所有子标签的内容拼接在一起
# print(soup.p.string)  # 当前标签只有文本或只有一个子有文本才拿出来，如果有其他标签，返回None
# print(list(soup.p.strings))  # 将文本内容和子标签中的文本内容放到 generator

# 嵌套选择
# print(soup.html.head)
# print(soup.head.title.string)  # 可以一直 . 下去

# 子节点 子孙节点
# print(soup.p.contents)  # p下所有子节点，放到列表中
# print(list(soup.p.children))  # 得到一个迭代器,包含p下所有子节点,跟contents本质一样，只是节约内存
# print(list(soup.p.descendants))  # 获取子孙节点,p下所有的标签都会选择出来  子子孙孙
# for i, child in enumerate(soup.p.children):
#     print(i, child)
# for i, child in enumerate(soup.p.descendants):
#     print(i, child)

# 父节点 祖先节点
# print(soup.a.parent)  # p 标签
# print(list(soup.a.parents))  # p body html 等

# 兄弟节点
# print(soup.a.next_sibling)  # 下一个兄弟
# print(soup.a.previous_sibling)  # 上一个兄弟
# print(list(soup.a.next_siblings))  # 下面的兄弟们=>生成器对象
# print(list(soup.a.previous_siblings))  # 上面的兄弟们=>生成器对象

'''
重点记住
.                     遍历
[] attrrs.get()       取属性 
text string strings   取文本
'''

`BS4`搜索文档树

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>

<p class="title" id="id_p"> test <b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')

# 五种过滤器 字符串 正则表达式 列表 True 方法
# find      找第一个
# find_all  找所有
# res = soup.find_all(name='p')
# res = soup.find(id="id_p")
# res = soup.find_all(class_="story")
# res = soup.find_all(name='p', class_='story')  # and 条件
# res = soup.find(name='a', id='link2').text
# res = soup.find(name='a', id='link2').attrs.get("href")  # attrs 将 属性: 值的字典
# res = soup.find(attrs={'id': 'link2', 'class': 'sister'}).attrs.get('href')  # 上面的第二种写法
# print(res)


# 正则表达式 value 是正则表达式
# import re
# res = soup.find_all(name=re.compile('^b'))  # 以 b 开头的 标签
# res = soup.find_all(href=re.compile('^http'))  # href 以 http 开头的
# res = soup.find_all(class_=re.compile('^s'))  # class 以 s 开头的
# print(res)


# 列表 value 值是列表
# res = soup.find_all(name=['body', 'a'])
# res = soup.find_all(class_=['sister', 'story'])
# res = soup.find_all(id=['link2', 'link3'])
# print(res)

# True value 值是True
# res = soup.find_all(name=True)
# res = soup.find_all(id=True)
# res = soup.find_all(href=True)
# print(res)

# 方法
# def has_class_but_no_id(tag):
#     return tag.has_attr('class') and not tag.has_attr('id')
# print(soup.find_all(name=has_class_but_no_id))  # 有class但是没有id的标签


# html页面中，只要有的东西，通过bs4都可以解析出来
# 遍历文档树+搜索文档树混用
# def has_class_but_no_id(tag):
#     return tag.has_attr('class') and not tag.has_attr('id')
# print(soup.find(name=has_class_but_no_id).a.text)


# find_all的其他参数limit:限制取几条  recursive：是否递归查找
# def has_class_but_no_id(tag):
#     return tag.has_attr('class') and not tag.has_attr('id')
# res = soup.find_all(name=has_class_but_no_id, limit=1)
# print(res)
# res = soup.find_all(name='a', recursive=False)  # 不递归查找,速度快，只找一层
# print(res)

`CSS`选择器

# css xpath选择器是通用的，基本所有的解析库(bs4,lxml,pyquery,selenium的解析库)，都支持css选择器，css在前端通用

from bs4 import BeautifulSoup

html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>

<p class="title" id="id_p"> test <b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'lxml')
'''
div      找div标签
div>a    找div下的紧邻的a
div a    找div下的子子孙孙的a
.sister  找类名为sister的标签
#id_p    找id为id_p的标签
'''

# soup.select()  # 找所有
# soup.select_one()  # 找一个

# res = soup.select('p')  # 找所有的 p 标签      
# res = soup.select('#id_p')  # 找 id 是 id_p 的标签
# res = soup.select('.sister')  # 找类是 sister 的标签
# res = soup.select_one('.story>a').attrs.get('href')  # .story 类下的 a 标签 生成字典形式 获取 href
# print(res)

# 终极大招 CV 浏览器中 找到需要查找的地方 选择Copy ---> Copy selector
import requests
response=requests.get('https://www.runoob.com/cssref/css-selectors.html')
soup=BeautifulSoup(response.text,'lxml')
res=soup.select_one('#content > table > tbody > tr:nth-child(2) > td:nth-child(3)').text
print(res)


# 只要页面中有的通过bs4都能解析出来