解析器 使用方法 优势 劣势
Python标准库 BeautifulSoup(markup,"html.parser") Python的内置标准库,执行速度适中,文档容错能力强 在Python 2.7.3和3.2.2之前的版本中文档容错能力差
lxml HTML 解析器 BeautifulSoup(markup,"lxml") 速度快,文档容错能力强 需要安装C语言库
lxml XML 解析器 BeautifulSoup(markup,["lxml","xml"]) 或 BeautifulSoup(markup,"xml") 速度快,唯一支持XML的解析器 需要安装C语言库
html5lib BeautifulSoup(markup,"html5lib") 最好的容错性,以浏览器的方式解析文档,生成HTML5格式的文档 速度慢,不依赖外部扩展

简介

Beautiful Soup是一个可以从HTML或XML文件中提取数据的Python库.它能够通过你喜欢的转换器实现惯用的文档导航,查找,修改文档的方式.Beautiful Soup会帮你节省数小时甚至数天的工作时间.

中文文档地址

https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html

安装BS4

pip install bs4

基本用法

#!/usr/bin/env python
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import requests

# Demo: parse a local file, then re-parse a downloaded page, and walk the tree.
# Use a context manager so the file handle is closed (the original leaked it,
# and this first soup is discarded on the very next assignment anyway).
with open("1.html", "r") as f:
    soup = BeautifulSoup(f, "lxml")
content = requests.get("http://www.ivsky.com/tupian/ziranfengguang/").text
soup = BeautifulSoup(content, "lxml")
# Pretty-print the document with standard indentation.
# print(soup.prettify())
# First <title> tag in the document.
print(soup.title)
# .name: the tag's name.
print(soup.title.name)
# .string: the text enclosed by the tag.
print(soup.title.string)
# .parent: the enclosing tag.
print(soup.title.parent.name)
# First <img> tag in the document.
print(soup.img)
# Subscript access reads a tag attribute.
print(soup.img['src'])
# All <img> tags, returned as a list.
print(soup.find_all('img'))
# The first tag whose class is "tpmenu".
ul = soup.find(attrs={"class": "tpmenu"})
print(ul)
# Iterate only the <li> elements: iterating `ul` directly would also yield
# whitespace NavigableString children, on which `.a` fails.
for li in ul.find_all("li"):
    print(li.a['href'])
    # All text contained anywhere under the element.
    print(li.get_text())

练习

使用BeautifulSoup解析天堂图片网所有分类下的图片
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin


class IvskyManager(object):
    """Crawl category menus, paginated listings and image-detail pages
    from ivsky.com, printing what is found at each level."""

    # Pretend to be a regular browser; some sites reject the default UA.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:49.0) Gecko/20100101 Firefox/49.0",
    }

    @classmethod
    def parse_big_cate(cls, url):
        """Parse the top-level category menu found at *url* and recurse
        into each category's sub-category page."""
        response = requests.get(url, headers=cls.headers)
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, "lxml")
        ul_tag = soup.find("ul", attrs={"class": "tpmenu"})
        # Iterate only <li> elements; iterating the <ul> directly would
        # also yield whitespace text nodes, which have no `.a`.
        for li in ul_tag.find_all("li"):
            big_cate_name = li.a.string
            # Resolve a possibly relative href against the page URL.
            big_cate_href = urljoin(url, li.a['href'])
            print("分类名称是:{}, 分类地址是:{}".format(big_cate_name, big_cate_href))
            cls.parse_small_cate(big_cate_href)
            break  # demo: crawl only the first big category

    @classmethod
    def parse_small_cate(cls, url):
        """Parse the sub-category links of one big-category page and
        recurse into each sub-category's paginated listing."""
        response = requests.get(url, headers=cls.headers)
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, "lxml")
        div_tag = soup.find("div", attrs={"class": "sline"}).div
        for a in div_tag.find_all("a"):
            small_cate_name = a.string
            small_cate_href = urljoin(url, a['href'])
            print("--------分类名称是:{}, 分类地址是:{}".format(small_cate_name, small_cate_href))
            cls.parse_all_pages(small_cate_href)
            break  # demo: crawl only the first sub-category

    @classmethod
    def parse_page_imgs(cls, url):
        """Parse one listing page: print every thumbnail and follow its
        detail link.  Return True when a "next page" link exists."""
        response = requests.get(url, headers=cls.headers)
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, "lxml")
        ul_tag = soup.find("ul", attrs={"class": "pli"})
        if not ul_tag:
            # Past the last page: the image list is absent entirely.
            return False
        for li in ul_tag.find_all("li"):
            detail_href = urljoin(url, li.div.a['href'])
            img_src = li.div.a.img['src']
            img_alt = li.div.a.img['alt']
            print("图片缩略图名称:{},图片缩略图地址:{},图片详情地址:{}".format(img_alt, img_src, detail_href))
            cls.parse_detail_img(detail_href)

        next_page = soup.find("a", attrs={"class": "page-next"})
        print(next_page)
        return next_page is not None

    @classmethod
    def parse_detail_img(cls, url):
        """Print the full-size image URL found on a detail page."""
        response = requests.get(url, headers=cls.headers)
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, "lxml")
        detail_src = soup.find("img", attrs={"id": "imgis"})['src']
        print("图片详情地址为:{}".format(detail_src))

    @classmethod
    def parse_all_pages(cls, url, start_page=1):
        """Walk the pagination of one sub-category.

        Page 1 is *url* itself; page N (N >= 2) lives at ``index_N.html``.
        The original hard-coded a start page of 200 (a debugging
        leftover), skipping pages 1-199 and never fetching the base page.
        """
        page = start_page
        while True:
            next_url = url if page == 1 else url + "index_{}.html".format(page)
            if not cls.parse_page_imgs(next_url):
                break
            page += 1

    @classmethod
    def init_db(cls):
        # Placeholder: initialise the storage backend.
        pass

    @classmethod
    def insert_imgs(cls):
        # Placeholder: persist crawled image records.
        pass

if __name__ == '__main__':
    # Entry point: start crawling from the nature-scenery category index.
    start_url = "http://www.ivsky.com/tupian/ziranfengguang/"
    IvskyManager.parse_big_cate(start_url)

results matching ""

    No results matching ""