古诗文 (Gushiwen, classical Chinese poetry site)

URL: http://www.gushiwen.org/shiwen/
XPath for the poem body text: string(.//div[@class='contson'])
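A minimal sketch, assuming the XPath above is applied to a single poem detail page with lxml; the detail-page URL below is only a hypothetical placeholder.
# -*- coding: utf-8 -*-
# Minimal sketch: apply the XPath above to one poem detail page.
# NOTE: the URL below is a hypothetical placeholder; replace it with a real
# detail page collected from http://www.gushiwen.org/shiwen/.
import requests
from lxml import etree

url = "https://www.gushiwen.org/shiwenv_xxxxxxxxxxxx.aspx"  # hypothetical example page
response = requests.get(url)
root = etree.HTML(response.content)
# string(...) collapses all the text nodes inside the contson div into one string
poem_text = root.xpath("string(//div[@class='contson'])")
print(poem_text.strip())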

伯乐在线 (Jobbole)

URL: http://blog.jobbole.com
API: http://blog.jobbole.com/all-posts/page/页码/  (页码 = page number)
Login required: no
Response format: HTML
Components: crawler part + search part
The script can be packaged into a single executable with pyinstaller -F ***.py
# -*- coding: utf-8 -*-
import requests
from lxml import etree
for page in range(1,554):
    url = "http://blog.jobbole.com/all-posts/page/%s/"%page
    print(url)
    response = requests.get(url)
    root = etree.HTML(response.content)
    div_list = root.xpath("//div[@class='grid-8']/div[@class='post floated-thumb']")

    for div in div_list:
        bole_title = div.xpath("div[@class='post-meta']/p/a[@class='archive-title']/@title")
        bole_title = bole_title[0] if bole_title else "没有标题"
        bole_title = bole_title.strip().replace(",",",")

        bole_url = div.xpath("div[@class='post-meta']/p/a[@class='archive-title']/@href")
        bole_url = bole_url[0] if bole_url else "没有网址"
        bole_url = bole_url.strip()
        if not bole_url.startswith("http"):
            # The href is a relative link; prepend the site root (same fix as for the image URL below)
            bole_url = "http://blog.jobbole.com" + bole_url
            print("网址不完整",bole_url)

        bole_date = div.xpath("div[@class='post-meta']/p/text()")
        bole_date = "".join(bole_date).replace("·","").replace(",","").strip()

        bole_cate = div.xpath("div[@class='post-meta']/p/a[@rel='category tag']/text()")
        bole_cate = ".".join(bole_cate)

        bole_img_src = div.xpath("div[@class='post-thumb']/a/img/@src")
        bole_img_src = bole_img_src[0].strip() if bole_img_src else "没有图片地址"
        if not bole_img_src.startswith("http") and "没有图片地址" not in bole_img_src:
            bole_img_src = "http://blog.jobbole.com"+ bole_img_src

        bole_info = [bole_title,bole_date,bole_cate,bole_img_src,bole_url]
        f = open("bole.csv","a",encoding="gb18030")
        f.write(",".join(bole_info)+"\n")
        f.close()
# -*- coding: utf-8 -*-
# Search part: look up saved articles in bole.csv by keyword
keyword = input("请输入搜索关键字(回车退出)")
keyword = keyword.lower()
f = open("bole.csv","r",encoding="gb18030")
lines = f.readlines()
f.close()
while keyword:

    for line in lines:
        infos = line.split(",")
        bole_title = infos[0]
        bole_url = infos[4]
        if keyword in bole_title.lower():
            print(bole_title,bole_url)

    keyword = input("请输入搜索关键字(回车退出)")
    keyword = keyword.lower()

print("欢迎再次使用本系统(按任意键退出)")

堆糖网 (Duitang)

URL: https://www.duitang.com/
API: https://www.duitang.com/napi/blog/list/by_search/?kw=美女&start=24
Login required: no
Response format: JSON
# -*- coding:utf-8 -*-
# __author__ = u'张泽楠'
# __date__ = '2018/2/28 23:48'
# Note: quick-and-dirty demo code; the key points are multithreading and a semaphore used to cap concurrency
import requests
import threading
import os

# Bounded semaphore: caps the number of concurrently running download threads at 10
thread_lock = threading.BoundedSemaphore(value=10)


def get_page(url):
    page = requests.get(url)
    page = page.content
    page = page.decode("utf-8")
    return page


def findall_in_page(page, startpart, endpart):
    all_strings = []
    end = 0
    while page.find(startpart, end) != -1:
        start = page.find(startpart, end) + len(startpart)
        end = page.find(endpart, start)
        string = page[start:end]
        all_strings.append(string)
    return all_strings


def page_from_duitang(label):
    pages = []
    url = "https://www.duitang.com/napi/blog/list/by_search/?kw={}&start={}"
    for index in range(0, 3600, 100):
        u = url.format(label, index)
        print(u)
        page = get_page(u)
        pages.append(page)
    return pages


def pic_urls_from_pages(pages):
    pic_urls = []
    for page in pages:
        urls = findall_in_page(page, 'path":"', '"')
        pic_urls.extend(urls)
    return pic_urls


def download_pics(url, n):
    try:
        r = requests.get(url)
        if not os.path.exists("pics"):
            os.makedirs("pics")
        path = 'pics/' + str(n) + '.jpg'
        with open(path, "wb") as f:
            f.write(r.content)
    finally:
        # Release the semaphore even if the download fails, so the main loop never stalls
        thread_lock.release()


def main(label):
    pages = page_from_duitang(label)
    pic_urls = pic_urls_from_pages(pages)
    for idx, url in enumerate(pic_urls):
        print("正在下载第{}张图片".format(idx))
        thread_lock.acquire()
        t = threading.Thread(target=download_pics, args=(url, idx))
        t.start()


if __name__ == '__main__':
    main("美女")
# -*- coding: utf-8 -*-
import requests
import json
import os  # os = operating system interface, used here to create folders
while True:
    kw = input("请输入搜索关键字(换行退出):")
    if not kw:
        print("退出搜索")
        break
    path = "imgs/"+kw
    if not os.path.exists(path):   
        os.makedirs(path)
    for page in range(1,2):
        #"%s--%s"%(变量1,变量2)
        url = "https://www.duitang.com/napi/blog/list/by_search/?kw=%s&start=%s"%(kw, (page-1)*24)
        print(url)
        response = requests.get(url)
        # Parse the JSON response string into a Python object (a dict)
        json_obj = json.loads(response.content)
        more = json_obj["data"]["more"]

        object_list = json_obj["data"]["object_list"]
        for obj in object_list:
            img_src = obj["photo"]["path"]
            img_response = requests.get(img_src)
            img_name = img_src.split("/")[-1]
            #print(img_src)
            file = open(path+"/"+img_name,"wb")
            file.write(img_response.content)
            file.close()
        if more==0:
            print("没有更多数据了")
            break

抖音 (Douyin) app video downloader

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from contextlib import closing
import requests, json, time, re, os, sys


class DouYin(object):
    def __init__(self):
        """
        Douyin app video downloader
        """
        # SSL verification is skipped (verify=False) in the requests below
        pass

    def get_video_urls(self, user_id):
        """
        Get the share URLs of a user's videos.
        Parameters:
            user_id: the Douyin ID to search for
        Returns:
            video_names: list of video file names
            video_urls: list of video share URLs
            nickname: the user's nickname
        """
        video_names = []
        video_urls = []
        unique_id = ''
        while unique_id != user_id:
            search_url = 'https://api.amemv.com/aweme/v1/discover/search/?cursor=0&keyword=%s&count=10&type=1&retry_type=no_retry&iid=17900846586&device_id=34692364855&ac=wifi&channel=xiaomi&aid=1128&app_name=aweme&version_code=162&version_name=1.6.2&device_platform=android&ssmix=a&device_type=MI+5&device_brand=Xiaomi&os_api=24&os_version=7.0&uuid=861945034132187&openudid=dc451556fc0eeadb&manifest_version_code=162&resolution=1080*1920&dpi=480&update_version_code=1622' % user_id
            req = requests.get(url=search_url, verify=False)
            html = json.loads(req.text)
            aweme_count = html['user_list'][0]['user_info']['aweme_count']
            uid = html['user_list'][0]['user_info']['uid']
            nickname = html['user_list'][0]['user_info']['nickname']
            unique_id = html['user_list'][0]['user_info']['unique_id']
        user_url = 'https://www.douyin.com/aweme/v1/aweme/post/?user_id=%s&max_cursor=0&count=%s' % (uid, aweme_count)
        req = requests.get(url=user_url, verify=False)
        html = json.loads(req.text)
        i = 1
        for each in html['aweme_list']:
            share_desc = each['share_info']['share_desc']
            if '抖音-原创音乐短视频社区' == share_desc:
                video_names.append(str(i) + '.mp4')
                i += 1
            else:
                video_names.append(share_desc + '.mp4')
            video_urls.append(each['share_info']['share_url'])

        return video_names, video_urls, nickname

    def get_download_url(self, video_url):
        """
        Get the real download address from a video share page.
        Parameters:
            video_url: the video share URL
        Returns:
            download_url: direct video download URL
        """
        req = requests.get(url=video_url, verify=False)
        bf = BeautifulSoup(req.text, 'lxml')
        script = bf.find_all('script')[-1]
        video_url_js = re.findall(r'var data = \[(.+)\];', str(script))[0]  # raw string avoids invalid-escape warnings
        video_html = json.loads(video_url_js)
        download_url = video_html['video']['play_addr']['url_list'][0]
        return download_url

    def video_downloader(self, video_url, video_name):
        """
        Download a video and report its size and progress.
        Parameters:
            video_url: direct video download URL
            video_name: local file name to save as
        Returns:
            None
        """
        size = 0
        with closing(requests.get(video_url, stream=True, verify=False)) as response:
            chunk_size = 1024
            content_size = int(response.headers['content-length'])
            if response.status_code == 200:
                sys.stdout.write('  [文件大小]:%0.2f MB\n' % (content_size / chunk_size / 1024))

                with open(video_name, "wb") as file:
                    for data in response.iter_content(chunk_size=chunk_size):
                        file.write(data)
                        size += len(data)
                        file.flush()

                    sys.stdout.write('    [下载进度]:%.2f%%' % float(size / content_size * 100))
                    sys.stdout.flush()
        time.sleep(1)

    def run(self):
        """
        Entry point: look up the user and download all of their videos.
        Parameters:
            None
        Returns:
            None
        """
        self.hello()
        # user_id = input('请输入ID(例如13978338):')
        user_id = 'sm666888'
        video_names, video_urls, nickname = self.get_video_urls(user_id)
        if nickname not in os.listdir():
            os.mkdir(nickname)
        sys.stdout.write('视频下载中:\n')
        for num in range(len(video_urls)):
            print('  %s\n' % video_urls[num])
            video_url = self.get_download_url(video_urls[num])
            if '\\' in video_names[num]:
                video_name = video_names[num].replace('\\', '')
            elif '/' in video_names[num]:
                video_name = video_names[num].replace('/', '')
            else:
                video_name = video_names[num]
            self.video_downloader(video_url, os.path.join(nickname, video_name))
            print('')

    def hello(self):
        """
        Print the welcome banner.
        Parameters:
            None
        Returns:
            None
        """
        print('*' * 100)
        print('\t\t\t\t抖音App视频下载小助手')
        print('*' * 100)


if __name__ == '__main__':
    douyin = DouYin()
    douyin.run()

帅啊网 (Shuaia)

# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
import requests
import os
import time

if __name__ == '__main__':
    list_url = []
    for num in range(1,3):
        if num == 1:
            url = 'http://www.shuaia.net/index.html'
        else:
            url = 'http://www.shuaia.net/index_%d.html' % num
        headers = {
                "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
        }
        req = requests.get(url = url,headers = headers)
        req.encoding = 'utf-8'
        html = req.text
        bf = BeautifulSoup(html, 'lxml')
        targets_url = bf.find_all(class_='item-img')

        for each in targets_url:
            list_url.append(each.img.get('alt') + '=' + each.get('href'))

    print('连接采集完成')

    for each_img in list_url:
        img_info = each_img.split('=')
        target_url = img_info[1]
        filename = img_info[0] + '.jpg'
        print('下载:' + filename)
        headers = {
            "User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
        }
        img_req = requests.get(url = target_url,headers = headers)
        img_req.encoding = 'utf-8'
        img_html = img_req.text
        img_bf_1 = BeautifulSoup(img_html, 'lxml')
        img_url = img_bf_1.find_all('div', class_='wr-single-content-list')
        img_bf_2 = BeautifulSoup(str(img_url), 'lxml')
        img_url = 'http://www.shuaia.net' + img_bf_2.div.img.get('src')
        if 'images' not in os.listdir():
            os.makedirs('images')
        urlretrieve(url = img_url,filename = 'images/' + filename)
        time.sleep(1)

    print('下载完成!')

笔趣网 (Biqukan) novel downloader

# -*- coding:UTF-8 -*-
from urllib import request
from bs4 import BeautifulSoup
import collections
import re
import os
import time
import sys
import types

"""
Class description: download a novel from 《笔趣看》 (http://www.biqukan.com/)

Parameters:
    target - URL of the novel's table-of-contents page (string)

Returns:
    None

Modify:
    2017-05-06
"""
class download(object):
    def __init__(self, target):
        self.__target_url = target
        self.__head = {'User-Agent':'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166  Safari/535.19',}

    """
    Function: collect the download link for every chapter

    Parameters:
        None

    Returns:
        novel_name + '.txt' - file name to save the novel as (string)
        numbers - number of chapters (int)
        download_dict - ordered dict mapping chapter titles to chapter URLs (dict)

    Modify:
        2017-05-06
    """
    def get_download_url(self):
        charter = re.compile(u'[第弟](.+)章', re.IGNORECASE)
        target_req = request.Request(url = self.__target_url, headers = self.__head)
        target_response = request.urlopen(target_req)
        target_html = target_response.read().decode('gbk','ignore')
        listmain_soup = BeautifulSoup(target_html,'lxml')
        chapters = listmain_soup.find_all('div',class_ = 'listmain')
        download_soup = BeautifulSoup(str(chapters), 'lxml')
        novel_name = str(download_soup.dl.dt).split("》")[0][5:]
        flag_name = "《" + novel_name + "》" + "正文卷"
        download_dict = collections.OrderedDict()
        begin_flag = False
        numbers = 1  # chapter counter, incremented for every chapter that is collected
        for child in download_soup.dl.children:
            if child != '\n':
                if child.string == u"%s" % flag_name:
                    begin_flag = True
                if begin_flag == True and child.a != None:
                    download_url = "http://www.biqukan.com" + child.a.get('href')
                    download_name = child.string
                    names = str(download_name).split('章')
                    name = charter.findall(names[0] + '章')
                    if name:
                        download_dict['第' + str(numbers) + '章 ' + names[1]] = download_url
                        numbers += 1
        return novel_name + '.txt', numbers, download_dict

    """
    Function: fetch the text of one chapter

    Parameters:
        url - chapter download link (string)

    Returns:
        soup_text - chapter text (string)

    Modify:
        2017-05-06
    """
    def Downloader(self, url):
        download_req = request.Request(url = url, headers = self.__head)
        download_response = request.urlopen(download_req)
        download_html = download_response.read().decode('gbk','ignore')
        soup_texts = BeautifulSoup(download_html, 'lxml')
        texts = soup_texts.find_all(id = 'content', class_ = 'showtxt')
        soup_text = BeautifulSoup(str(texts), 'lxml').div.text.replace('\xa0','')
        return soup_text

    """
    Function: append one chapter to the output file

    Parameters:
        name - chapter title (string)
        path - file name of the novel in the current directory (string)
        text - chapter text (string)

    Returns:
        None

    Modify:
        2017-05-06
    """
    def Writer(self, name, path, text):
        write_flag = True
        with open(path, 'a', encoding='utf-8') as f:
            f.write(name + '\n\n')
            for each in text:
                if each == 'h':
                    write_flag = False
                if write_flag == True and each != ' ':
                    f.write(each)
                if write_flag == True and each == '\r':
                    f.write('\n')            
            f.write('\n\n')

if __name__ == "__main__":

    # Novel table-of-contents URL
    target_url = "http://www.biqukan.com/0_784/"

    # Instantiate the downloader
    d = download(target = target_url)
    name, numbers, url_dict = d.get_download_url()
    if name in os.listdir():
        os.remove(name)
    index = 1

    # Download every chapter
    print("《%s》下载中:" % name[:-4])
    for key, value in url_dict.items():
        d.Writer(key, name, d.Downloader(value))
        sys.stdout.write("已下载:%.1f%%" %  float(index/numbers*100) + '\r\n')
        sys.stdout.flush()
        index += 1    

    print("《%s》下载完成!" % name[:-4])

小说阅读网 (readnovel.com)

URL: https://www.readnovel.com/
Crawling every free novel on the site takes roughly 200 days.
# -*- coding: utf-8 -*-
import requests, os, time
from lxml import etree
from fake_useragent import UserAgent  # install manually: pip install fake_useragent
agent = UserAgent()
# Create a folder so all downloaded novels are kept together
os.makedirs("novel", exist_ok=True)
url = "https://www.readnovel.com/free/all?pageSize=10&gender=2&catId=-1&isFinish=-1&isVip=1&size=-1&updT=-1&orderBy=0&pageNum=2"
response = requests.get(url, headers={"User-Agent": agent.random})
root = etree.HTML(response.content)
novel_list = root.xpath("//div[@class='right-book-list']/ul/li")
for novel in novel_list:
    novel_title = novel.xpath("div/h3/a/@title")[0]
    novel_title = novel_title.strip()
    novel_href = novel.xpath("div/h3/a/@href")[0]
    if not novel_href.startswith("http"):
        novel_href = "https://www.readnovel.com" + novel_href
    novel_detail_response = requests.get(novel_href, headers={"User-Agent": agent.random})
    novel_detail_root = etree.HTML(novel_detail_response.content)
    novel_detail_href = novel_detail_root.xpath("//a[text()='免费试读']/@href")[0]
    if not novel_detail_href.startswith("http"):
        novel_detail_href = "https:" + novel_detail_href
    while True:
        novel_detail_response = requests.get(novel_detail_href, headers={"User-Agent": agent.random})
        novel_detail_root = etree.HTML(novel_detail_response.content)
        chapter_name = novel_detail_root.xpath("//h3/text()")[0]
        chapter_name = chapter_name.strip()
        novel_content = novel_detail_root.xpath("//div[@class='read-content j_readContent']/p/text()")
        if not novel_content:
            break
        file = open("novel/"+novel_title+".txt", "a", encoding="gb18030")
        file.write(chapter_name+"\n")
        file.close()
        print(chapter_name)
        for content in novel_content:
            content = "    " + content.strip()
            file = open("novel/"+novel_title+".txt", "a", encoding="gb18030")
            file.write(content+"\n")
            file.close()
        novel_detail_href = novel_detail_root.xpath("//a[text()='下一章']/@href")
        if not novel_detail_href:
            print("没有了")
            break
        else:
            novel_detail_href = novel_detail_href[0]
        if not novel_detail_href.startswith("http"):
            novel_detail_href = "https:" + novel_detail_href
        # Slow the crawler down to avoid being detected and blocked
        time.sleep(2)

纵横中文网 (Zongheng): download all free novels

http://www.zongheng.com/

站长素材网 (sc.chinaz.com, sound effects)

http://sc.chinaz.com/yinxiao/

微博 (Weibo)

喜马拉雅 (Ximalaya)

全书小说网 (Quanshuwang)

URL: http://www.quanshuwang.com/
API: http://www.quanshuwang.com/list/分类编号_页码编号.html  (分类编号 = category id, 页码编号 = page number)
Response format: HTML
#!/usr/bin/env python
# -*- coding:utf-8 -*-

""" 
@author: 张泽楠 
@contact: [email protected] 
@file: main.py 
@time: 2018/3/9 20:26 
"""
# -*- coding: utf-8 -*-
import requests
from lxml import etree

for page in range(1, 940):
    url = "http://www.quanshuwang.com/list/1_%s.html" % page
    response = requests.get(url)
    response.encoding = response.apparent_encoding  # let requests detect the page encoding (the site serves GBK)
    root = etree.HTML(response.text)
    li_list = root.xpath("//ul[@class='seeWell cf']/li")
    for li in li_list:
        novel_title = li.xpath("span/a[@class='clearfix stitle']/@title")
        novel_title = novel_title[0] if novel_title else "没有小说名字"

        novel_user = li.xpath("span/a[2]/text()")
        novel_user = novel_user[0] if novel_user else "没有作者"

        novel_img_src = li.xpath("a/img/@src")
        novel_img_src = novel_img_src[0] if novel_img_src else "没有图片地址"
        print(novel_title, novel_user, novel_img_src)

飞卢小说网 (Faloo)

URL: https://b.faloo.com
API: https://b.faloo.com/l/0/0/0/4/0/3/页码.html  (页码 = page number)
Response format: HTML
Notes: no sample code yet; see the minimal sketch below
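A rough, unverified sketch of paging through the list API above with the same requests + lxml pattern used elsewhere in these notes; the XPath selector below is an assumption about the page layout and must be checked against the live HTML before use.
# -*- coding: utf-8 -*-
# Minimal sketch: page through the Faloo list API and print book titles.
# NOTE: the XPath selector below is assumed, not verified against the real page.
import requests
from lxml import etree

for page in range(1, 4):
    url = "https://b.faloo.com/l/0/0/0/4/0/3/%s.html" % page
    response = requests.get(url)
    root = etree.HTML(response.content)
    titles = root.xpath("//div[@class='book_list']//a/@title")  # assumed selector, verify first
    for title in titles:
        print(title)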

校花网 (521609.com)

URL: http://www.521609.com/
API: http://www.521609.com/daxuexiaohua/list3页码.html  (页码 = page number)
Login required: no
Response format: HTML
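A minimal sketch for the list pages above, following the same requests + lxml pattern as the other examples; the XPath selector and the prefix used for relative image paths are assumptions and should be verified against the live page.
# -*- coding: utf-8 -*-
# Minimal sketch: walk the list pages and save the thumbnails locally.
# NOTE: the XPath selector below is assumed, not verified against the real page.
import os
import requests
from lxml import etree

os.makedirs("xiaohua", exist_ok=True)
for page in range(1, 3):
    url = "http://www.521609.com/daxuexiaohua/list3%s.html" % page
    response = requests.get(url)
    root = etree.HTML(response.content)
    img_srcs = root.xpath("//div[@class='index_img']//img/@src")  # assumed selector, verify first
    for src in img_srcs:
        img_url = src if src.startswith("http") else "http://www.521609.com" + src
        img_name = img_url.split("/")[-1]
        img_response = requests.get(img_url)
        with open("xiaohua/" + img_name, "wb") as f:
            f.write(img_response.content)
        print("saved", img_name)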

豆瓣电影Top250 (Douban Movie Top 250)

URL: https://movie.douban.com
API: https://movie.douban.com/top250?start=(页码-1)*25&filter=  (start = (page number - 1) * 25)
Login required: no
Response format: HTML
# -*- coding: utf-8 -*-
import requests
from lxml import etree

# Page n corresponds to start=(n-1)*25: page 3 -> 50, page 4 -> 75, page 5 -> 100
for page in range(1,2): 
    # "ZHANGSAN"*10 would repeat the string 10 times
    url = "https://movie.douban.com/top250?start=%s&filter="%((page-1)*25)
    response = requests.get(url)
    root = etree.HTML(response.content, parser=etree.HTMLParser(encoding='utf-8'))
    films = root.xpath("//ol[@class='grid_view']/li/div[@class='item']")     
    for film in films:
        film_name = film.xpath("div[@class='info']/div[@class='hd']/a/span/text()")

        # new_film_name = ""
        # for name in film_name:
        #     new_film_name = new_film_name + name
        # The three lines above are equivalent to the join() call below:
        # "".join(...) concatenates every element of film_name with an empty separator

        # Split and re-join once more because the titles contain non-breaking spaces,
        # which turn into mojibake when the row is saved
        new_film_name = "".join(film_name)
        new_film_name = "".join(new_film_name.split())


        film_star = film.xpath("div[@class='info']/div[@class='bd']/div[@class='star']/span[@class='rating_num']/text()")
        film_star = film_star[0] if film_star else "0.0"
        film_play = film.xpath("div[@class='info']/div[@class='hd']/span/text()")
        film_play = film_play[0] if film_play else "不可播放"
        # strip("[]") removes the surrounding brackets from values such as "[可播放]"
        film_play = film_play.strip("[]")

        film_info = [new_film_name,film_star,film_play]
        file = open("film.csv","a",encoding="gb18030")
        info = ",".join(film_info)
        file.write(info+"\n")
        file.close()
# coding:utf-8
# __author__ = 'Gao'

# fake-useragent: a third-party module that provides up-to-date User-Agent strings
# for mainstream browsers (Chrome, Firefox, IE, Opera, and others).
# Install: pip install fake-useragent

import csv, codecs
import re

import requests
# Third-party library used to pick a random User-Agent value
from fake_useragent import UserAgent
from lxml import etree


class DBMovie(object):
    def __init__(self):
        self.base_url = 'https://movie.douban.com/top250'
        self.ua = UserAgent()
        self.html_obj = None

    def get_page_code(self, url=''):
        """
        Fetch the page source for a given URL suffix.
        :param url: relative address taken from the page's "next page" link, e.g. ?start=25&filter=
        :return:
        """
        # Build the absolute URL of this page
        abs_url = self.base_url + url
        headers = {
            'User-Agent':self.ua.random
        }
        content = requests.get(abs_url, headers=headers).content
        self.html_obj = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))

        self.get_content_by_xpath(self.html_obj)
        # self.get_content_by_css(self.html_obj)

    def get_content_by_xpath(self, html_obj):
        """
        Extract the movie details from one page's Element tree using XPath (or cssselect).
        :param html_obj: root document object of the current page
        :return:
        """
        movie_list = []
        item_div = html_obj.xpath('//div[@class="item"]')
        for item_tag in item_div:
            movie_dict = {}
            # Ranking number inside the em tag
            em = item_tag.xpath('.//em/text()')[0]
            # Title spans with the movie's brief info
            hd = item_tag.xpath('.//div[@class="hd"]/a/span/text()')
            # Join the three title spans in hd into one string
            info = ''
            for info_text in hd:
                content = info_text.strip('\n').strip()
                info += content
            # Detailed movie info
            # Cast and crew line
            member_info = item_tag.xpath('.//p[@class=""]/text()')[0].strip('\n').strip()
            # Rating
            star_number = item_tag.xpath('.//span[@class="rating_num"]/text()')[0]
            # Number of reviews
            comment_number = item_tag.xpath('.//div[@class="star"]/span[last()]/text()')[0]
            comment_number = re.search(re.compile(r'(\d+)'), comment_number).group(1)
            # One-line quote
            quote = item_tag.xpath('.//span[@class="inq"]')
            if len(quote) != 0:
                quote = quote[0].xpath('text()')[0]
            else:
                quote = u'影评不存在'

            # Collect the fields into a dict
            movie_dict['movie_rank'] = em
            movie_dict['movie_name'] = info
            movie_dict['movie_member'] = member_info
            movie_dict['movie_star'] = star_number
            movie_dict['movie_comment'] = comment_number
            movie_dict['movie_quote'] = quote

            movie_list.append(movie_dict)

        # Write every dict in movie_list to the local CSV file
        self.write_movie_info(movie_list)

    def write_movie_info(self, movie_list):
        """
        Write all movie records of the current page to the local file.
        :param movie_list: all records of the current page
        :return:
        """
        for movie in movie_list:
            self.writer.writerow(movie)

        # Current page written; fetch the URL of the next page
        self.get_next_page_url()

    def open_file(self):
        csv_file = open('movie1.csv', 'w', encoding="gb18030", newline='')  # newline='' avoids blank rows on Windows
        self.writer = csv.DictWriter(csv_file,
                                fieldnames=['movie_rank', 'movie_name', 'movie_member', 'movie_star', 'movie_comment',
                                            'movie_quote'])
        self.writer.writeheader()

    def get_next_page_url(self):
        a = self.html_obj.xpath('//span[@class="next"]/a')
        if len(a) == 0:
            print('最后一页了')
            return
        next_page = a[0].xpath('@href')[0]
        self.get_page_code(next_page)

    def get_content_by_css(self, html_obj):
        item_div = html_obj.cssselect('.item')
        for div_tag in item_div:
            # Ranking
            em = div_tag.cssselect('em')[0].text
            # Movie title
            name_list = div_tag.cssselect('a>span')
            name = ''
            for name_tag in name_list:
                name_str = name_tag.text.strip().strip('\n')
                name += name_str

            # Cast info
            bd = div_tag.cssselect('.bd')[0]
            p_list = bd.cssselect('p')
            if len(p_list) == 2:
                # Both cast info and a one-line quote are present
                member = p_list[0].text
                quote = p_list[1].cssselect('span')[0].text
            else:
                # Only cast info, no quote
                member = p_list[0].text

            # Rating and review count
            star_list = bd.cssselect('.star>span')
            star = star_list[1].text
            comment_number = star_list[3].text
            comment_number = re.search(re.compile(r'(\d+)'), comment_number).group(1)

            print('css选择器选择完毕')

if __name__ == '__main__':
    movie_obj = DBMovie()
    movie_obj.open_file()
    movie_obj.get_page_code()

Crawl 豆瓣 (Douban) Books Top 250 data

Crawl 豆瓣 (Douban) Music Top 250 data

Crawl 酷狗 (KuGou) Top 500 data

微信 (WeChat) articles

Crawl short-term rental listings in Beijing

Crawl the full text of the novel 《斗破苍穹》

Crawl jokes from 糗事百科 (Qiushibaike)

Crawl user location info from 糗事百科 (Qiushibaike)

Crawl novel info from 起点中文网 (Qidian)

Crawl PEXELS images

Crawl second-hand listings from 转转网 (Zhuanzhuan)

Crawl hot-comment articles from 简书 (Jianshu)

Crawl user activity from 简书 (Jianshu)

Crawl 简书 (Jianshu) 7-day trending posts

Crawl 简书 (Jianshu) popular collections

Crawl articles included in 简书 (Jianshu) collections

Crawl 简书 (Jianshu) recommendations

Crawl job postings from 拉勾网 (Lagou)

Crawl 新浪微博 (Sina Weibo) friends-feed posts

Crawl friends' posts from QQ空间 (Qzone)

Crawl 淘宝 (Taobao) product info

Crawl 知乎 (Zhihu) Python highlight topics
