Gushiwen (classical Chinese poetry and prose)
URL: http://www.gushiwen.org/shiwen/
XPath for the poem body text: string(.//div[@class='contson'])
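A minimal sketch of using that XPath with requests + lxml (only the listing URL and the contson selector come from the notes above; the User-Agent header and printing a short preview of each poem are assumptions):
import requests
from lxml import etree

response = requests.get("http://www.gushiwen.org/shiwen/",
                        headers={"User-Agent": "Mozilla/5.0"})  # assumed header
root = etree.HTML(response.content)
for poem in root.xpath("//div[@class='contson']"):
    # string(.) flattens all nested text nodes into one string, which is what
    # the string(.//div[@class='contson']) expression above relies on
    text = poem.xpath("string(.)").strip()
    print(text[:60])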
Jobbole (blog.jobbole.com)
URL: http://blog.jobbole.com
API: http://blog.jobbole.com/all-posts/page/<page number>/
Login required: no
Response format: HTML
Components: a crawler part plus a search part
The script can be bundled into a single executable with pyinstaller -F ***.py
import requests
from lxml import etree

for page in range(1, 554):
    url = "http://blog.jobbole.com/all-posts/page/%s/" % page
    print(url)
    response = requests.get(url)
    root = etree.HTML(response.content)
    div_list = root.xpath("//div[@class='grid-8']/div[@class='post floated-thumb']")
    for div in div_list:
        bole_title = div.xpath("div[@class='post-meta']/p/a[@class='archive-title']/@title")
        bole_title = bole_title[0] if bole_title else "没有标题"
        # replace ASCII commas so the title does not break the CSV columns
        bole_title = bole_title.strip().replace(",", ",")
        bole_url = div.xpath("div[@class='post-meta']/p/a[@class='archive-title']/@href")
        bole_url = bole_url[0] if bole_url else "没有网址"
        bole_url = bole_url.strip()
        if not bole_url.startswith("http"):
            # relative link: prepend the site root, same as for the image URLs below
            bole_url = "http://blog.jobbole.com" + bole_url
            print("网址不完整", bole_url)
        bole_date = div.xpath("div[@class='post-meta']/p/text()")
        bole_date = "".join(bole_date).replace("·", "").replace(",", "").strip()
        bole_cate = div.xpath("div[@class='post-meta']/p/a[@rel='category tag']/text()")
        bole_cate = ".".join(bole_cate)
        bole_img_src = div.xpath("div[@class='post-thumb']/a/img/@src")
        bole_img_src = bole_img_src[0].strip() if bole_img_src else "没有图片地址"
        if not bole_img_src.startswith("http") and "没有图片地址" not in bole_img_src:
            bole_img_src = "http://blog.jobbole.com" + bole_img_src
        bole_info = [bole_title, bole_date, bole_cate, bole_img_src, bole_url]
        f = open("bole.csv", "a", encoding="gb18030")
        f.write(",".join(bole_info) + "\n")
        f.close()

# search part: look up articles in the bole.csv produced by the crawler above
keyword = input("请输入搜索关键字(回车退出)")
keyword = keyword.lower()
f = open("bole.csv", "r", encoding="gb18030")
lines = f.readlines()
f.close()
while keyword:
    for line in lines:
        infos = line.strip().split(",")
        bole_title = infos[0]
        bole_url = infos[4]
        if keyword in bole_title.lower():
            print(bole_title, bole_url)
    keyword = input("请输入搜索关键字(回车退出)")
    keyword = keyword.lower()
print("欢迎再次使用本系统(按任意键退出)")
Duitang (duitang.com)
URL: https://www.duitang.com/
API: https://www.duitang.com/napi/blog/list/by_search/?kw=美女&start=24
Login required: no
Response format: JSON
import requests
import threading
import os

# allow at most 10 download threads to run at the same time
thread_lock = threading.BoundedSemaphore(value=10)


def get_page(url):
    page = requests.get(url)
    page = page.content
    page = page.decode("utf-8")
    return page


def findall_in_page(page, startpart, endpart):
    # collect every substring that sits between startpart and endpart
    all_strings = []
    end = 0
    while page.find(startpart, end) != -1:
        start = page.find(startpart, end) + len(startpart)
        end = page.find(endpart, start)
        string = page[start:end]
        all_strings.append(string)
    return all_strings


def page_from_duitang(label):
    pages = []
    url = "https://www.duitang.com/napi/blog/list/by_search/?kw={}&start={}"
    for index in range(0, 3600, 100):
        u = url.format(label, index)
        print(u)
        page = get_page(u)
        pages.append(page)
    return pages


def pic_urls_from_pages(pages):
    pic_urls = []
    for page in pages:
        urls = findall_in_page(page, 'path":"', '"')
        pic_urls.extend(urls)
    return pic_urls


def download_pics(url, n):
    r = requests.get(url)
    if not os.path.exists("pics"):
        os.makedirs("pics")
    path = 'pics/' + str(n) + '.jpg'
    with open(path, "wb") as f:
        f.write(r.content)
    # free a slot for the next download thread
    thread_lock.release()


def main(label):
    pages = page_from_duitang(label)
    pic_urls = pic_urls_from_pages(pages)
    for idx, url in enumerate(pic_urls):
        print("正在下载第{}张图片".format(idx))
        thread_lock.acquire()
        t = threading.Thread(target=download_pics, args=(url, idx))
        t.start()


if __name__ == '__main__':
    main("美女")
import requests
import json
import os

while True:
    kw = input("请输入搜索关键字(换行退出):")
    if not kw:
        print("退出搜索")
        break
    path = "imgs/" + kw
    if not os.path.exists(path):
        os.makedirs(path)
    for page in range(1, 2):
        url = "https://www.duitang.com/napi/blog/list/by_search/?kw=%s&start=%s" % (kw, (page - 1) * 24)
        print(url)
        response = requests.get(url)
        json_obj = json.loads(response.content)
        more = json_obj["data"]["more"]
        object_list = json_obj["data"]["object_list"]
        for obj in object_list:
            img_src = obj["photo"]["path"]
            img_response = requests.get(img_src)
            img_name = img_src.split("/")[-1]
            file = open(path + "/" + img_name, "wb")
            file.write(img_response.content)
            file.close()
        if more == 0:
            print("没有更多数据了")
            break
Douyin app video download
from bs4 import BeautifulSoup
from contextlib import closing
import requests, json, time, re, os, sys

class DouYin(object):
    def __init__(self):
        """
        Douyin app video downloader.
        """
        pass

    def get_video_urls(self, user_id):
        """
        Collect the share pages of a user's videos.
        Parameters:
            user_id: the Douyin ID to search for
        Returns:
            video_names: list of video file names
            video_urls: list of video share-page URLs
            nickname: the user's nickname
        """
        video_names = []
        video_urls = []
        unique_id = ''
        # keep searching until the result whose unique_id matches the requested user_id comes back
        while unique_id != user_id:
            search_url = 'https://api.amemv.com/aweme/v1/discover/search/?cursor=0&keyword=%s&count=10&type=1&retry_type=no_retry&iid=17900846586&device_id=34692364855&ac=wifi&channel=xiaomi&aid=1128&app_name=aweme&version_code=162&version_name=1.6.2&device_platform=android&ssmix=a&device_type=MI+5&device_brand=Xiaomi&os_api=24&os_version=7.0&uuid=861945034132187&openudid=dc451556fc0eeadb&manifest_version_code=162&resolution=1080*1920&dpi=480&update_version_code=1622' % user_id
            req = requests.get(url=search_url, verify=False)
            html = json.loads(req.text)
            aweme_count = html['user_list'][0]['user_info']['aweme_count']
            uid = html['user_list'][0]['user_info']['uid']
            nickname = html['user_list'][0]['user_info']['nickname']
            unique_id = html['user_list'][0]['user_info']['unique_id']
        user_url = 'https://www.douyin.com/aweme/v1/aweme/post/?user_id=%s&max_cursor=0&count=%s' % (uid, aweme_count)
        req = requests.get(url=user_url, verify=False)
        html = json.loads(req.text)
        i = 1
        for each in html['aweme_list']:
            share_desc = each['share_info']['share_desc']
            # videos without a custom title carry the default description; number those sequentially
            if '抖音-原创音乐短视频社区' == share_desc:
                video_names.append(str(i) + '.mp4')
                i += 1
            else:
                video_names.append(share_desc + '.mp4')
            video_urls.append(each['share_info']['share_url'])
        return video_names, video_urls, nickname

    def get_download_url(self, video_url):
        """
        Extract the direct download address from a video share page.
        Parameters:
            video_url: video share-page URL
        Returns:
            download_url: direct video download URL
        """
        req = requests.get(url=video_url, verify=False)
        bf = BeautifulSoup(req.text, 'lxml')
        script = bf.find_all('script')[-1]
        video_url_js = re.findall(r'var data = \[(.+)\];', str(script))[0]
        video_html = json.loads(video_url_js)
        download_url = video_html['video']['play_addr']['url_list'][0]
        return download_url

    def video_downloader(self, video_url, video_name):
        """
        Download one video with a simple progress display.
        Parameters:
            video_url: direct video download URL
            video_name: local file name to save to
        Returns:
            None
        """
        size = 0
        with closing(requests.get(video_url, stream=True, verify=False)) as response:
            chunk_size = 1024
            content_size = int(response.headers['content-length'])
            if response.status_code == 200:
                sys.stdout.write(' [文件大小]:%0.2f MB\n' % (content_size / chunk_size / 1024))
                with open(video_name, "wb") as file:
                    for data in response.iter_content(chunk_size=chunk_size):
                        file.write(data)
                        size += len(data)
                        file.flush()
                        sys.stdout.write(' [下载进度]:%.2f%%' % float(size / content_size * 100))
                        sys.stdout.flush()
        time.sleep(1)

    def run(self):
        """
        Entry point: look the user up, then download every video one by one.
        Parameters:
            None
        Returns:
            None
        """
        self.hello()
        user_id = 'sm666888'
        video_names, video_urls, nickname = self.get_video_urls(user_id)
        if nickname not in os.listdir():
            os.mkdir(nickname)
        sys.stdout.write('视频下载中:\n')
        for num in range(len(video_urls)):
            print('  %s\n' % video_urls[num])
            video_url = self.get_download_url(video_urls[num])
            # strip characters that are not allowed in file names
            if '\\' in video_names[num]:
                video_name = video_names[num].replace('\\', '')
            elif '/' in video_names[num]:
                video_name = video_names[num].replace('/', '')
            else:
                video_name = video_names[num]
            self.video_downloader(video_url, os.path.join(nickname, video_name))
            print('')

    def hello(self):
        """
        Print the welcome banner.
        Parameters:
            None
        Returns:
            None
        """
        print('*' * 100)
        print('\t\t\t\t抖音App视频下载小助手')
        print('*' * 100)


if __name__ == '__main__':
    douyin = DouYin()
    douyin.run()
Shuaia (shuaia.net)
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
import requests
import os
import time

if __name__ == '__main__':
    list_url = []
    for num in range(1, 3):
        if num == 1:
            url = 'http://www.shuaia.net/index.html'
        else:
            url = 'http://www.shuaia.net/index_%d.html' % num
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
        }
        req = requests.get(url=url, headers=headers)
        req.encoding = 'utf-8'
        html = req.text
        bf = BeautifulSoup(html, 'lxml')
        targets_url = bf.find_all(class_='item-img')
        for each in targets_url:
            list_url.append(each.img.get('alt') + '=' + each.get('href'))
    print('连接采集完成')
    for each_img in list_url:
        img_info = each_img.split('=')
        target_url = img_info[1]
        filename = img_info[0] + '.jpg'
        print('下载:' + filename)
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
        }
        img_req = requests.get(url=target_url, headers=headers)
        img_req.encoding = 'utf-8'
        img_html = img_req.text
        img_bf_1 = BeautifulSoup(img_html, 'lxml')
        img_url = img_bf_1.find_all('div', class_='wr-single-content-list')
        img_bf_2 = BeautifulSoup(str(img_url), 'lxml')
        img_url = 'http://www.shuaia.net' + img_bf_2.div.img.get('src')
        if 'images' not in os.listdir():
            os.makedirs('images')
        urlretrieve(url=img_url, filename='images/' + filename)
        time.sleep(1)
    print('下载完成!')
Biqukan novel download (biqukan.com)
from urllib import request
from bs4 import BeautifulSoup
import collections
import re
import os
import time
import sys
import types
"""
类说明:下载《笔趣看》网小说: url:http://www.biqukan.com/
Parameters:
target - 《笔趣看》网指定的小说目录地址(string)
Returns:
无
Modify:
2017-05-06
"""
class download(object):
def __init__(self, target):
self.__target_url = target
self.__head = {'User-Agent':'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19',}
"""
函数说明:获取下载链接
Parameters:
无
Returns:
novel_name + '.txt' - 保存的小说名(string)
numbers - 章节数(int)
download_dict - 保存章节名称和下载链接的字典(dict)
Modify:
2017-05-06
"""
def get_download_url(self):
charter = re.compile(u'[第弟](.+)章', re.IGNORECASE)
target_req = request.Request(url = self.__target_url, headers = self.__head)
target_response = request.urlopen(target_req)
target_html = target_response.read().decode('gbk','ignore')
listmain_soup = BeautifulSoup(target_html,'lxml')
chapters = listmain_soup.find_all('div',class_ = 'listmain')
download_soup = BeautifulSoup(str(chapters), 'lxml')
novel_name = str(download_soup.dl.dt).split("》")[0][5:]
flag_name = "《" + novel_name + "》" + "正文卷"
numbers = (len(download_soup.dl.contents) - 1) / 2 - 8
download_dict = collections.OrderedDict()
begin_flag = False
numbers = 1
for child in download_soup.dl.children:
if child != '\n':
if child.string == u"%s" % flag_name:
begin_flag = True
if begin_flag == True and child.a != None:
download_url = "http://www.biqukan.com" + child.a.get('href')
download_name = child.string
names = str(download_name).split('章')
name = charter.findall(names[0] + '章')
if name:
download_dict['第' + str(numbers) + '章 ' + names[1]] = download_url
numbers += 1
return novel_name + '.txt', numbers, download_dict
"""
函数说明:爬取文章内容
Parameters:
url - 下载连接(string)
Returns:
soup_text - 章节内容(string)
Modify:
2017-05-06
"""
def Downloader(self, url):
download_req = request.Request(url = url, headers = self.__head)
download_response = request.urlopen(download_req)
download_html = download_response.read().decode('gbk','ignore')
soup_texts = BeautifulSoup(download_html, 'lxml')
texts = soup_texts.find_all(id = 'content', class_ = 'showtxt')
soup_text = BeautifulSoup(str(texts), 'lxml').div.text.replace('\xa0','')
return soup_text
"""
函数说明:将爬取的文章内容写入文件
Parameters:
name - 章节名称(string)
path - 当前路径下,小说保存名称(string)
text - 章节内容(string)
Returns:
无
Modify:
2017-05-06
"""
def Writer(self, name, path, text):
write_flag = True
with open(path, 'a', encoding='utf-8') as f:
f.write(name + '\n\n')
for each in text:
if each == 'h':
write_flag = False
if write_flag == True and each != ' ':
f.write(each)
if write_flag == True and each == '\r':
f.write('\n')
f.write('\n\n')
if __name__ == "__main__":
target_url = "http://www.biqukan.com/0_784/"
d = download(target = target_url)
name, numbers, url_dict = d.get_download_url()
if name in os.listdir():
os.remove(name)
index = 1
print("《%s》下载中:" % name[:-4])
for key, value in url_dict.items():
d.Writer(key, name, d.Downloader(value))
sys.stdout.write("已下载:%.1f%%" % float(index/numbers*100) + '\r\n')
sys.stdout.flush()
index += 1
print("《%s》下载完成!" % name[:-4])
Xiaoshuo Yuedu Wang (readnovel.com)
URL: https://www.readnovel.com/
Crawling the site's entire free-novel catalogue takes roughly 200 days.
# -*- coding: utf-8 -*-
import requests, os, time
from lxml import etree
from fake_useragent import UserAgent  # install manually with: pip install fake_useragent

agent = UserAgent()
# create a folder so all the downloaded novels are kept in one place
os.makedirs("novel", exist_ok=True)
url = "https://www.readnovel.com/free/all?pageSize=10&gender=2&catId=-1&isFinish=-1&isVip=1&size=-1&updT=-1&orderBy=0&pageNum=2"
response = requests.get(url, headers={"User-Agent": agent.random})
root = etree.HTML(response.content)
novel_list = root.xpath("//div[@class='right-book-list']/ul/li")
for novel in novel_list:
    novel_title = novel.xpath("div/h3/a/@title")[0]
    novel_title = novel_title.strip()
    novel_href = novel.xpath("div/h3/a/@href")[0]
    if not novel_href.startswith("http"):
        novel_href = "https://www.readnovel.com" + novel_href
    novel_detail_response = requests.get(novel_href, headers={"User-Agent": agent.random})
    novel_detail_root = etree.HTML(novel_detail_response.content)
    novel_detail_href = novel_detail_root.xpath("//a[text()='免费试读']/@href")[0]
    if not novel_detail_href.startswith("http"):
        novel_detail_href = "https:" + novel_detail_href
    while True:
        novel_detail_response = requests.get(novel_detail_href, headers={"User-Agent": agent.random})
        novel_detail_root = etree.HTML(novel_detail_response.content)
        chapter_name = novel_detail_root.xpath("//h3/text()")[0]
        chapter_name = chapter_name.strip()
        novel_content = novel_detail_root.xpath("//div[@class='read-content j_readContent']/p/text()")
        if not novel_content:
            break
        file = open("novel/" + novel_title + ".txt", "a", encoding="gb18030")
        file.write(chapter_name + "\n")
        file.close()
        print(chapter_name)
        for content in novel_content:
            content = " " + content.strip()
            file = open("novel/" + novel_title + ".txt", "a", encoding="gb18030")
            file.write(content + "\n")
            file.close()
        novel_detail_href = novel_detail_root.xpath("//a[text()='下一章']/@href")
        if not novel_detail_href:
            print("没有了")
            break
        else:
            novel_detail_href = novel_detail_href[0]
            if not novel_detail_href.startswith("http"):
                novel_detail_href = "https:" + novel_detail_href
        # slow the crawler down so it is less likely to be blocked for requesting too fast
        time.sleep(2)
Zongheng Chinese Net (download all of its free novels)
URL: http://www.zongheng.com/
Zhanzhang Sucai (Chinaz) sound-effect materials
URL: http://sc.chinaz.com/yinxiao/
Weibo
Ximalaya
Quanshuwang (quanshuwang.com)
URL: http://www.quanshuwang.com/
API: http://www.quanshuwang.com/list/<category id>_<page number>.html
Response format: HTML
"""
@author: 张泽楠
@contact: [email protected]
@file: main.py
@time: 2018/3/9 20:26
"""
import requests
from lxml import etree

for page in range(1, 940):
    url = "http://www.quanshuwang.com/list/1_%s.html" % page
    response = requests.get(url)
    response.encoding = response.apparent_encoding
    root = etree.HTML(response.content)
    li_list = root.xpath("//ul[@class='seeWell cf']/li")
    for li in li_list:
        novel_title = li.xpath("span/a[@class='clearfix stitle']/@title")
        novel_title = novel_title[0] if novel_title else "没有小说名字"
        novel_user = li.xpath("span/a[2]/text()")
        novel_user = novel_user[0] if novel_user else "没有作者"
        novel_img_src = li.xpath("a/img/@src")
        novel_img_src = novel_img_src[0] if novel_img_src else "没有图片地址"
        print(novel_title, novel_user, novel_img_src)
Faloo Fiction (b.faloo.com)
URL: https://b.faloo.com
API: https://b.faloo.com/l/0/0/0/4/0/3/<page number>.html
Response format: HTML
Notes:
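No crawler is included for this site yet; a minimal sketch of walking the listing pages might look like the following (only the URL pattern above comes from the notes; the page range, the User-Agent header, and printing the page title are assumptions):
import requests
from lxml import etree

for page in range(1, 4):  # hypothetical page range
    url = "https://b.faloo.com/l/0/0/0/4/0/3/%s.html" % page
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    root = etree.HTML(response.content)
    # print the <title> just to confirm that the listing page was fetched
    print(url, root.xpath("string(//title)").strip())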
Xiaohua Net (521609.com, campus belle photos)
URL: http://www.521609.com/
API: http://www.521609.com/daxuexiaohua/list3<page number>.html
Login required: no
Response format: HTML
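No crawler is included here either; a rough sketch under the same caveats (only the list3<page>.html pattern comes from the notes; the page range and the generic //img/@src selector are placeholders to adjust after inspecting the page):
import requests
from lxml import etree

for page in range(1, 3):  # hypothetical page range
    url = "http://www.521609.com/daxuexiaohua/list3%s.html" % page
    response = requests.get(url)
    root = etree.HTML(response.content)
    for src in root.xpath("//img/@src"):  # placeholder selector
        # relative image paths need the site root prepended
        if not src.startswith("http"):
            src = "http://www.521609.com" + src
        print(src)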
Douban Movies Top 250
URL: https://movie.douban.com
API: https://movie.douban.com/top250?start=(page-1)*25&filter=
Login required: no
Response format: HTML
import requests
from lxml import etree

for page in range(1, 2):
    url = "https://movie.douban.com/top250?start=%s&filter=" % ((page - 1) * 25)
    response = requests.get(url)
    root = etree.HTML(response.content, parser=etree.HTMLParser(encoding='utf-8'))
    films = root.xpath("//ol[@class='grid_view']/li/div[@class='item']")
    for film in films:
        film_name = film.xpath("div[@class='info']/div[@class='hd']/a/span/text()")
        new_film_name = "".join(film_name)
        new_film_name = "".join(new_film_name.split())
        film_star = film.xpath("div[@class='info']/div[@class='bd']/div[@class='star']/span[@class='rating_num']/text()")
        film_star = film_star[0] if film_star else "0.0"
        film_play = film.xpath("div[@class='info']/div[@class='hd']/span/text()")
        film_play = film_play[0] if film_play else "不可播放"
        film_play = film_play.strip("[]")
        film_info = [new_film_name, film_star, film_play]
        file = open("film.csv", "a", encoding="gb18030")
        info = ",".join(film_info)
        file.write(info + "\n")
        file.close()
import csv, codecs
import re
import requests
from fake_useragent import UserAgent
from lxml import etree

class DBMovie(object):
    def __init__(self):
        self.base_url = 'https://movie.douban.com/top250'
        self.ua = UserAgent()
        self.html_obj = None

    def get_page_code(self, url=''):
        """
        Fetch the page source for the given URL suffix.
        :param url: relative address taken from the page's "next page" link, e.g. ?start=25&filter=
        :return:
        """
        abs_url = self.base_url + url
        headers = {
            'User-Agent': self.ua.random
        }
        content = requests.get(abs_url, headers=headers).content
        self.html_obj = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
        self.get_content_by_xpath(self.html_obj)

    def get_content_by_xpath(self, html_obj):
        """
        Extract the movie fields from one page's document object using xpath/cssselect.
        :param html_obj: root document object of the current page
        :return:
        """
        movie_list = []
        item_div = html_obj.xpath('//div[@class="item"]')
        for item_tag in item_div:
            movie_dict = {}
            em = item_tag.xpath('.//em/text()')[0]
            hd = item_tag.xpath('.//div[@class="hd"]/a/span/text()')
            info = ''
            for info_text in hd:
                content = info_text.strip('\n').strip()
                info += content
            member_info = item_tag.xpath('.//p[@class=""]/text()')[0].strip('\n').strip()
            star_number = item_tag.xpath('.//span[@class="rating_num"]/text()')[0]
            comment_number = item_tag.xpath('.//div[@class="star"]/span[last()]/text()')[0]
            comment_number = re.search(re.compile(r'(\d+)'), comment_number).group(1)
            quote = item_tag.xpath('.//span[@class="inq"]')
            if len(quote) != 0:
                quote = quote[0].xpath('text()')[0]
            else:
                quote = u'影评不存在'
            movie_dict['movie_rank'] = em
            movie_dict['movie_name'] = info
            movie_dict['movie_member'] = member_info
            movie_dict['movie_star'] = star_number
            movie_dict['movie_comment'] = comment_number
            movie_dict['movie_quote'] = quote
            movie_list.append(movie_dict)
        self.write_movie_info(movie_list)

    def write_movie_info(self, movie_list):
        """
        Write every movie from the current page to the local CSV file.
        :param movie_list: all movie dicts collected from the current page
        :return:
        """
        for movie in movie_list:
            self.writer.writerow(movie)
        self.get_next_page_url()

    def open_file(self):
        csv_file = open('movie1.csv', 'w', encoding="gb18030")
        self.writer = csv.DictWriter(csv_file,
                                     fieldnames=['movie_rank', 'movie_name', 'movie_member', 'movie_star',
                                                 'movie_comment', 'movie_quote'])
        self.writer.writeheader()

    def get_next_page_url(self):
        a = self.html_obj.xpath('//span[@class="next"]/a')
        if len(a) == 0:
            print('最后一页了')
            return
        next_page = a[0].xpath('@href')[0]
        self.get_page_code(next_page)

    def get_content_by_css(self, html_obj):
        item_div = html_obj.cssselect('.item')
        for div_tag in item_div:
            em = div_tag.cssselect('em')[0].text
            name_list = div_tag.cssselect('a>span')
            name = ''
            for name_tag in name_list:
                name_str = name_tag.text.strip().strip('\n')
                name += name_str
            bd = div_tag.cssselect('.bd')[0]
            p_list = bd.cssselect('p')
            if len(p_list) == 2:
                member = p_list[0].text
                quote = p_list[1].cssselect('span')[0].text
            else:
                member = p_list[0].text
            star_list = bd.cssselect('.star>span')
            star = star_list[1].text
            comment_number = star_list[3].text
            comment_number = re.search(re.compile(r'(\d+)'), comment_number).group(1)
        print('css选择器选择完毕')


if __name__ == '__main__':
    movie_obj = DBMovie()
    movie_obj.open_file()
    movie_obj.get_page_code()
Scrape the Douban Books Top 250
Scrape the Douban Music Top 250
Scrape the KuGou Top 500
WeChat articles
Scrape short-term rental listings for the Beijing area
Scrape the full text of the novel 《斗破苍穹》 (Battle Through the Heavens)
Scrape jokes from Qiushibaike
Scrape user location info from Qiushibaike
Scrape novel info from Qidian (qidian.com)
Scrape images from PEXELS
Scrape second-hand listings from the Zhuanzhuan marketplace
Scrape hot-comment articles from Jianshu
Scrape user activity feeds from Jianshu
Scrape Jianshu's 7-day trending list
Scrape Jianshu's popular collections
Scrape the articles included in Jianshu collections
Scrape Jianshu's recommended content
Scrape job postings from Lagou
Scrape friends' feeds from Sina Weibo
Scrape friends' "shuoshuo" posts from QQ Zone
Scrape product listings from Taobao
Scrape the Python highlights topic from Zhihu