Gushiwen (classical Chinese poetry and prose)
URL: http://www.gushiwen.org/shiwen/
XPath for the poem body text: string(.//div[@class='contson'])
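A minimal sketch of using that XPath with requests + lxml (only the listing URL and the contson selector come from the notes above; the User-Agent header and printing a short preview of each poem are assumptions):
import requests
from lxml import etree

response = requests.get("http://www.gushiwen.org/shiwen/",
                        headers={"User-Agent": "Mozilla/5.0"})  # assumed header
root = etree.HTML(response.content)
for poem in root.xpath("//div[@class='contson']"):
    # string(.) flattens all nested text nodes into one string, which is what
    # the string(.//div[@class='contson']) expression above relies on
    text = poem.xpath("string(.)").strip()
    print(text[:60])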
Jobbole (blog.jobbole.com)
URL: http://blog.jobbole.com
API: http://blog.jobbole.com/all-posts/page/<page number>/
Login required: no
Response format: HTML
Components: a crawler part plus a search part
The script can be bundled into a single executable with pyinstaller -F ***.py
import requests
from lxml import etree

for page in range(1, 554):
    url = "http://blog.jobbole.com/all-posts/page/%s/" % page
    print(url)
    response = requests.get(url)
    root = etree.HTML(response.content)
    div_list = root.xpath("//div[@class='grid-8']/div[@class='post floated-thumb']")
    for div in div_list:
        bole_title = div.xpath("div[@class='post-meta']/p/a[@class='archive-title']/@title")
        bole_title = bole_title[0] if bole_title else "没有标题"
        # replace ASCII commas so the title does not break the CSV columns
        bole_title = bole_title.strip().replace(",", ",")
        bole_url = div.xpath("div[@class='post-meta']/p/a[@class='archive-title']/@href")
        bole_url = bole_url[0] if bole_url else "没有网址"
        bole_url = bole_url.strip()
        if not bole_url.startswith("http"):
            # relative link: prepend the site root, same as for the image URLs below
            bole_url = "http://blog.jobbole.com" + bole_url
            print("网址不完整", bole_url)
        bole_date = div.xpath("div[@class='post-meta']/p/text()")
        bole_date = "".join(bole_date).replace("·", "").replace(",", "").strip()
        bole_cate = div.xpath("div[@class='post-meta']/p/a[@rel='category tag']/text()")
        bole_cate = ".".join(bole_cate)
        bole_img_src = div.xpath("div[@class='post-thumb']/a/img/@src")
        bole_img_src = bole_img_src[0].strip() if bole_img_src else "没有图片地址"
        if not bole_img_src.startswith("http") and "没有图片地址" not in bole_img_src:
            bole_img_src = "http://blog.jobbole.com" + bole_img_src
        bole_info = [bole_title, bole_date, bole_cate, bole_img_src, bole_url]
        f = open("bole.csv", "a", encoding="gb18030")
        f.write(",".join(bole_info) + "\n")
        f.close()

# search part: look up articles in the bole.csv produced by the crawler above
keyword = input("请输入搜索关键字(回车退出)")
keyword = keyword.lower()
f = open("bole.csv", "r", encoding="gb18030")
lines = f.readlines()
f.close()
while keyword:
    for line in lines:
        infos = line.strip().split(",")
        bole_title = infos[0]
        bole_url = infos[4]
        if keyword in bole_title.lower():
            print(bole_title, bole_url)
    keyword = input("请输入搜索关键字(回车退出)")
    keyword = keyword.lower()
print("欢迎再次使用本系统(按任意键退出)")
Duitang (duitang.com)
URL: https://www.duitang.com/
API: https://www.duitang.com/napi/blog/list/by_search/?kw=美女&start=24
Login required: no
Response format: JSON
import requests
import threading
import os

# allow at most 10 download threads to run at the same time
thread_lock = threading.BoundedSemaphore(value=10)


def get_page(url):
    page = requests.get(url)
    page = page.content
    page = page.decode("utf-8")
    return page


def findall_in_page(page, startpart, endpart):
    # collect every substring that sits between startpart and endpart
    all_strings = []
    end = 0
    while page.find(startpart, end) != -1:
        start = page.find(startpart, end) + len(startpart)
        end = page.find(endpart, start)
        string = page[start:end]
        all_strings.append(string)
    return all_strings


def page_from_duitang(label):
    pages = []
    url = "https://www.duitang.com/napi/blog/list/by_search/?kw={}&start={}"
    for index in range(0, 3600, 100):
        u = url.format(label, index)
        print(u)
        page = get_page(u)
        pages.append(page)
    return pages


def pic_urls_from_pages(pages):
    pic_urls = []
    for page in pages:
        urls = findall_in_page(page, 'path":"', '"')
        pic_urls.extend(urls)
    return pic_urls


def download_pics(url, n):
    r = requests.get(url)
    if not os.path.exists("pics"):
        os.makedirs("pics")
    path = 'pics/' + str(n) + '.jpg'
    with open(path, "wb") as f:
        f.write(r.content)
    # free a slot for the next download thread
    thread_lock.release()


def main(label):
    pages = page_from_duitang(label)
    pic_urls = pic_urls_from_pages(pages)
    for idx, url in enumerate(pic_urls):
        print("正在下载第{}张图片".format(idx))
        thread_lock.acquire()
        t = threading.Thread(target=download_pics, args=(url, idx))
        t.start()


if __name__ == '__main__':
    main("美女")
import requests
import json
import os

while True:
    kw = input("请输入搜索关键字(换行退出):")
    if not kw:
        print("退出搜索")
        break
    path = "imgs/" + kw
    if not os.path.exists(path):
        os.makedirs(path)
    for page in range(1, 2):
        url = "https://www.duitang.com/napi/blog/list/by_search/?kw=%s&start=%s" % (kw, (page - 1) * 24)
        print(url)
        response = requests.get(url)
        json_obj = json.loads(response.content)
        more = json_obj["data"]["more"]
        object_list = json_obj["data"]["object_list"]
        for obj in object_list:
            img_src = obj["photo"]["path"]
            img_response = requests.get(img_src)
            img_name = img_src.split("/")[-1]
            file = open(path + "/" + img_name, "wb")
            file.write(img_response.content)
            file.close()
        if more == 0:
            print("没有更多数据了")
            break
Douyin app video download
from bs4 import BeautifulSoup
from contextlib import closing
import requests, json, time, re, os, sys

class DouYin(object):
    def __init__(self):
        """
        Douyin app video downloader.
        """
        pass

    def get_video_urls(self, user_id):
        """
        Collect the share pages of a user's videos.
        Parameters:
            user_id: the Douyin ID to search for
        Returns:
            video_names: list of video file names
            video_urls: list of video share-page URLs
            nickname: the user's nickname
        """
        video_names = []
        video_urls = []
        unique_id = ''
        # keep searching until the result whose unique_id matches the requested user_id comes back
        while unique_id != user_id:
            search_url = 'https://api.amemv.com/aweme/v1/discover/search/?cursor=0&keyword=%s&count=10&type=1&retry_type=no_retry&iid=17900846586&device_id=34692364855&ac=wifi&channel=xiaomi&aid=1128&app_name=aweme&version_code=162&version_name=1.6.2&device_platform=android&ssmix=a&device_type=MI+5&device_brand=Xiaomi&os_api=24&os_version=7.0&uuid=861945034132187&openudid=dc451556fc0eeadb&manifest_version_code=162&resolution=1080*1920&dpi=480&update_version_code=1622' % user_id
            req = requests.get(url=search_url, verify=False)
            html = json.loads(req.text)
            aweme_count = html['user_list'][0]['user_info']['aweme_count']
            uid = html['user_list'][0]['user_info']['uid']
            nickname = html['user_list'][0]['user_info']['nickname']
            unique_id = html['user_list'][0]['user_info']['unique_id']
        user_url = 'https://www.douyin.com/aweme/v1/aweme/post/?user_id=%s&max_cursor=0&count=%s' % (uid, aweme_count)
        req = requests.get(url=user_url, verify=False)
        html = json.loads(req.text)
        i = 1
        for each in html['aweme_list']:
            share_desc = each['share_info']['share_desc']
            # videos without a custom title carry the default description; number those sequentially
            if '抖音-原创音乐短视频社区' == share_desc:
                video_names.append(str(i) + '.mp4')
                i += 1
            else:
                video_names.append(share_desc + '.mp4')
            video_urls.append(each['share_info']['share_url'])
        return video_names, video_urls, nickname

    def get_download_url(self, video_url):
        """
        Extract the direct download address from a video share page.
        Parameters:
            video_url: video share-page URL
        Returns:
            download_url: direct video download URL
        """
        req = requests.get(url=video_url, verify=False)
        bf = BeautifulSoup(req.text, 'lxml')
        script = bf.find_all('script')[-1]
        video_url_js = re.findall(r'var data = \[(.+)\];', str(script))[0]
        video_html = json.loads(video_url_js)
        download_url = video_html['video']['play_addr']['url_list'][0]
        return download_url

    def video_downloader(self, video_url, video_name):
        """
        Download one video with a simple progress display.
        Parameters:
            video_url: direct video download URL
            video_name: local file name to save to
        Returns:
            None
        """
        size = 0
        with closing(requests.get(video_url, stream=True, verify=False)) as response:
            chunk_size = 1024
            content_size = int(response.headers['content-length'])
            if response.status_code == 200:
                sys.stdout.write(' [文件大小]:%0.2f MB\n' % (content_size / chunk_size / 1024))
                with open(video_name, "wb") as file:
                    for data in response.iter_content(chunk_size=chunk_size):
                        file.write(data)
                        size += len(data)
                        file.flush()
                        sys.stdout.write(' [下载进度]:%.2f%%' % float(size / content_size * 100))
                        sys.stdout.flush()
        time.sleep(1)

    def run(self):
        """
        Entry point: look the user up, then download every video one by one.
        Parameters:
            None
        Returns:
            None
        """
        self.hello()
        user_id = 'sm666888'
        video_names, video_urls, nickname = self.get_video_urls(user_id)
        if nickname not in os.listdir():
            os.mkdir(nickname)
        sys.stdout.write('视频下载中:\n')
        for num in range(len(video_urls)):
            print('  %s\n' % video_urls[num])
            video_url = self.get_download_url(video_urls[num])
            # strip characters that are not allowed in file names
            if '\\' in video_names[num]:
                video_name = video_names[num].replace('\\', '')
            elif '/' in video_names[num]:
                video_name = video_names[num].replace('/', '')
            else:
                video_name = video_names[num]
            self.video_downloader(video_url, os.path.join(nickname, video_name))
            print('')

    def hello(self):
        """
        Print the welcome banner.
        Parameters:
            None
        Returns:
            None
        """
        print('*' * 100)
        print('\t\t\t\t抖音App视频下载小助手')
        print('*' * 100)


if __name__ == '__main__':
    douyin = DouYin()
    douyin.run()
Shuaia (shuaia.net)
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
import requests
import os
import time

if __name__ == '__main__':
    list_url = []
    for num in range(1, 3):
        if num == 1:
            url = 'http://www.shuaia.net/index.html'
        else:
            url = 'http://www.shuaia.net/index_%d.html' % num
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
        }
        req = requests.get(url=url, headers=headers)
        req.encoding = 'utf-8'
        html = req.text
        bf = BeautifulSoup(html, 'lxml')
        targets_url = bf.find_all(class_='item-img')
        for each in targets_url:
            list_url.append(each.img.get('alt') + '=' + each.get('href'))
    print('连接采集完成')
    for each_img in list_url:
        img_info = each_img.split('=')
        target_url = img_info[1]
        filename = img_info[0] + '.jpg'
        print('下载:' + filename)
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
        }
        img_req = requests.get(url=target_url, headers=headers)
        img_req.encoding = 'utf-8'
        img_html = img_req.text
        img_bf_1 = BeautifulSoup(img_html, 'lxml')
        img_url = img_bf_1.find_all('div', class_='wr-single-content-list')
        img_bf_2 = BeautifulSoup(str(img_url), 'lxml')
        img_url = 'http://www.shuaia.net' + img_bf_2.div.img.get('src')
        if 'images' not in os.listdir():
            os.makedirs('images')
        urlretrieve(url=img_url, filename='images/' + filename)
        time.sleep(1)
    print('下载完成!')
Biqukan novel download (biqukan.com)
from urllib import request
from bs4 import BeautifulSoup
import collections
import re
import os
import time
import sys
import types
"""
类说明:下载《笔趣看》网小说: url:http://www.biqukan.com/
Parameters:
target - 《笔趣看》网指定的小说目录地址(string)
Returns:
无
Modify:
2017-05-06
"""
class download(object):
def __init__(self, target):
self.__target_url = target
self.__head = {'User-Agent':'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19',}
"""
函数说明:获取下载链接
Parameters:
无
Returns:
novel_name + '.txt' - 保存的小说名(string)
numbers - 章节数(int)
download_dict - 保存章节名称和下载链接的字典(dict)
Modify:
2017-05-06
"""
def get_download_url(self):
charter = re.compile(u'[第弟](.+)章', re.IGNORECASE)
target_req = request.Request(url = self.__target_url, headers = self.__head)
target_response = request.urlopen(target_req)
target_html = target_response.read().decode('gbk','ignore')
listmain_soup = BeautifulSoup(target_html,'lxml')
chapters = listmain_soup.find_all('div',class_ = 'listmain')
download_soup = BeautifulSoup(str(chapters), 'lxml')
novel_name = str(download_soup.dl.dt).split("》")[0][5:]
flag_name = "《" + novel_name + "》" + "正文卷"
numbers = (len(download_soup.dl.contents) - 1) / 2 - 8
download_dict = collections.OrderedDict()
begin_flag = False
numbers = 1
for child in download_soup.dl.children:
if child != '\n':
if child.string == u"%s" % flag_name:
begin_flag = True
if begin_flag == True and child.a != None:
download_url = "http://www.biqukan.com" + child.a.get('href')
download_name = child.string
names = str(download_name).split('章')
name = charter.findall(names[0] + '章')
if name:
download_dict['第' + str(numbers) + '章 ' + names[1]] = download_url
numbers += 1
return novel_name + '.txt', numbers, download_dict
"""
函数说明:爬取文章内容
Parameters:
url - 下载连接(string)
Returns:
soup_text - 章节内容(string)
Modify:
2017-05-06
"""
def Downloader(self, url):
download_req = request.Request(url = url, headers = self.__head)
download_response = request.urlopen(download_req)
download_html = download_response.read().decode('gbk','ignore')
soup_texts = BeautifulSoup(download_html, 'lxml')
texts = soup_texts.find_all(id = 'content', class_ = 'showtxt')
soup_text = BeautifulSoup(str(texts), 'lxml').div.text.replace('\xa0','')
return soup_text
"""
函数说明:将爬取的文章内容写入文件
Parameters:
name - 章节名称(string)
path - 当前路径下,小说保存名称(string)
text - 章节内容(string)
Returns:
无
Modify:
2017-05-06
"""
def Writer(self, name, path, text):
write_flag = True
with open(path, 'a', encoding='utf-8') as f:
f.write(name + '\n\n')
for each in text:
if each == 'h':
write_flag = False
if write_flag == True and each != ' ':
f.write(each)
if write_flag == True and each == '\r':
f.write('\n')
f.write('\n\n')
if __name__ == "__main__":
target_url = "http://www.biqukan.com/0_784/"
d = download(target = target_url)
name, numbers, url_dict = d.get_download_url()
if name in os.listdir():
os.remove(name)
index = 1
print("《%s》下载中:" % name[:-4])
for key, value in url_dict.items():
d.Writer(key, name, d.Downloader(value))
sys.stdout.write("已下载:%.1f%%" % float(index/numbers*100) + '\r\n')
sys.stdout.flush()
index += 1
print("《%s》下载完成!" % name[:-4])
Xiaoshuo Yuedu Wang (readnovel.com)
URL: https://www.readnovel.com/
Crawling the site's entire free-novel catalogue takes roughly 200 days.
# -*- coding: utf-8 -*-
import requests, os, time
from lxml import etree
from fake_useragent import UserAgent  # install manually with: pip install fake_useragent

agent = UserAgent()
# create a folder so all the downloaded novels are kept in one place
os.makedirs("novel", exist_ok=True)
url = "https://www.readnovel.com/free/all?pageSize=10&gender=2&catId=-1&isFinish=-1&isVip=1&size=-1&updT=-1&orderBy=0&pageNum=2"
response = requests.get(url, headers={"User-Agent": agent.random})
root = etree.HTML(response.content)
novel_list = root.xpath("//div[@class='right-book-list']/ul/li")
for novel in novel_list:
    novel_title = novel.xpath("div/h3/a/@title")[0]
    novel_title = novel_title.strip()
    novel_href = novel.xpath("div/h3/a/@href")[0]
    if not novel_href.startswith("http"):
        novel_href = "https://www.readnovel.com" + novel_href
    novel_detail_response = requests.get(novel_href, headers={"User-Agent": agent.random})
    novel_detail_root = etree.HTML(novel_detail_response.content)
    novel_detail_href = novel_detail_root.xpath("//a[text()='免费试读']/@href")[0]
    if not novel_detail_href.startswith("http"):
        novel_detail_href = "https:" + novel_detail_href
    while True:
        novel_detail_response = requests.get(novel_detail_href, headers={"User-Agent": agent.random})
        novel_detail_root = etree.HTML(novel_detail_response.content)
        chapter_name = novel_detail_root.xpath("//h3/text()")[0]
        chapter_name = chapter_name.strip()
        novel_content = novel_detail_root.xpath("//div[@class='read-content j_readContent']/p/text()")
        if not novel_content:
            break
        file = open("novel/" + novel_title + ".txt", "a", encoding="gb18030")
        file.write(chapter_name + "\n")
        file.close()
        print(chapter_name)
        for content in novel_content:
            content = " " + content.strip()
            file = open("novel/" + novel_title + ".txt", "a", encoding="gb18030")
            file.write(content + "\n")
            file.close()
        novel_detail_href = novel_detail_root.xpath("//a[text()='下一章']/@href")
        if not novel_detail_href:
            print("没有了")
            break
        else:
            novel_detail_href = novel_detail_href[0]
            if not novel_detail_href.startswith("http"):
                novel_detail_href = "https:" + novel_detail_href
        # slow the crawler down so it is less likely to be blocked for requesting too fast
        time.sleep(2)
Zongheng Chinese Net (download all of its free novels)
URL: http://www.zongheng.com/
Zhanzhang Sucai (Chinaz) sound-effect materials
URL: http://sc.chinaz.com/yinxiao/
Weibo
Ximalaya
Quanshuwang (quanshuwang.com)
URL: http://www.quanshuwang.com/
API: http://www.quanshuwang.com/list/<category id>_<page number>.html
Response format: HTML
"""
@author: 张泽楠
@contact: [email protected]
@file: main.py
@time: 2018/3/9 20:26
"""
import requests
from lxml import etree

for page in range(1, 940):
    url = "http://www.quanshuwang.com/list/1_%s.html" % page
    response = requests.get(url)
    response.encoding = response.apparent_encoding
    root = etree.HTML(response.content)
    li_list = root.xpath("//ul[@class='seeWell cf']/li")
    for li in li_list:
        novel_title = li.xpath("span/a[@class='clearfix stitle']/@title")
        novel_title = novel_title[0] if novel_title else "没有小说名字"
        novel_user = li.xpath("span/a[2]/text()")
        novel_user = novel_user[0] if novel_user else "没有作者"
        novel_img_src = li.xpath("a/img/@src")
        novel_img_src = novel_img_src[0] if novel_img_src else "没有图片地址"
        print(novel_title, novel_user, novel_img_src)
Faloo Fiction (b.faloo.com)
URL: https://b.faloo.com
API: https://b.faloo.com/l/0/0/0/4/0/3/<page number>.html
Response format: HTML
Notes:
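No crawler is included for this site yet; a minimal sketch of walking the listing pages might look like the following (only the URL pattern above comes from the notes; the page range, the User-Agent header, and printing the page title are assumptions):
import requests
from lxml import etree

for page in range(1, 4):  # hypothetical page range
    url = "https://b.faloo.com/l/0/0/0/4/0/3/%s.html" % page
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    root = etree.HTML(response.content)
    # print the <title> just to confirm that the listing page was fetched
    print(url, root.xpath("string(//title)").strip())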
Xiaohua Net (521609.com, campus belle photos)
URL: http://www.521609.com/
API: http://www.521609.com/daxuexiaohua/list3<page number>.html
Login required: no
Response format: HTML
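No crawler is included here either; a rough sketch under the same caveats (only the list3<page>.html pattern comes from the notes; the page range and the generic //img/@src selector are placeholders to adjust after inspecting the page):
import requests
from lxml import etree

for page in range(1, 3):  # hypothetical page range
    url = "http://www.521609.com/daxuexiaohua/list3%s.html" % page
    response = requests.get(url)
    root = etree.HTML(response.content)
    for src in root.xpath("//img/@src"):  # placeholder selector
        # relative image paths need the site root prepended
        if not src.startswith("http"):
            src = "http://www.521609.com" + src
        print(src)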
Douban Movies Top 250
URL: https://movie.douban.com
API: https://movie.douban.com/top250?start=(page-1)*25&filter=
Login required: no
Response format: HTML
import requests
from lxml import etree

for page in range(1, 2):
    url = "https://movie.douban.com/top250?start=%s&filter=" % ((page - 1) * 25)
    response = requests.get(url)
    root = etree.HTML(response.content, parser=etree.HTMLParser(encoding='utf-8'))
    films = root.xpath("//ol[@class='grid_view']/li/div[@class='item']")
    for film in films:
        film_name = film.xpath("div[@class='info']/div[@class='hd']/a/span/text()")
        new_film_name = "".join(film_name)
        new_film_name = "".join(new_film_name.split())
        film_star = film.xpath("div[@class='info']/div[@class='bd']/div[@class='star']/span[@class='rating_num']/text()")
        film_star = film_star[0] if film_star else "0.0"
        film_play = film.xpath("div[@class='info']/div[@class='hd']/span/text()")
        film_play = film_play[0] if film_play else "不可播放"
        film_play = film_play.strip("[]")
        film_info = [new_film_name, film_star, film_play]
        file = open("film.csv", "a", encoding="gb18030")
        info = ",".join(film_info)
        file.write(info + "\n")
        file.close()
import csv, codecs
import re
import requests
from fake_useragent import UserAgent
from lxml import etree

class DBMovie(object):
    def __init__(self):
        self.base_url = 'https://movie.douban.com/top250'
        self.ua = UserAgent()
        self.html_obj = None

    def get_page_code(self, url=''):
        """
        Fetch the page source for the given URL suffix.
        :param url: relative address taken from the page's "next page" link, e.g. ?start=25&filter=
        :return:
        """
        abs_url = self.base_url + url
        headers = {
            'User-Agent': self.ua.random
        }
        content = requests.get(abs_url, headers=headers).content
        self.html_obj = etree.HTML(content, parser=etree.HTMLParser(encoding='utf-8'))
        self.get_content_by_xpath(self.html_obj)

    def get_content_by_xpath(self, html_obj):
        """
        Extract the movie fields from one page's document object using xpath/cssselect.
        :param html_obj: root document object of the current page
        :return:
        """
        movie_list = []
        item_div = html_obj.xpath('//div[@class="item"]')
        for item_tag in item_div:
            movie_dict = {}
            em = item_tag.xpath('.//em/text()')[0]
            hd = item_tag.xpath('.//div[@class="hd"]/a/span/text()')
            info = ''
            for info_text in hd:
                content = info_text.strip('\n').strip()
                info += content
            member_info = item_tag.xpath('.//p[@class=""]/text()')[0].strip('\n').strip()
            star_number = item_tag.xpath('.//span[@class="rating_num"]/text()')[0]
            comment_number = item_tag.xpath('.//div[@class="star"]/span[last()]/text()')[0]
            comment_number = re.search(re.compile(r'(\d+)'), comment_number).group(1)
            quote = item_tag.xpath('.//span[@class="inq"]')
            if len(quote) != 0:
                quote = quote[0].xpath('text()')[0]
            else:
                quote = u'影评不存在'
            movie_dict['movie_rank'] = em
            movie_dict['movie_name'] = info
            movie_dict['movie_member'] = member_info
            movie_dict['movie_star'] = star_number
            movie_dict['movie_comment'] = comment_number
            movie_dict['movie_quote'] = quote
            movie_list.append(movie_dict)
        self.write_movie_info(movie_list)

    def write_movie_info(self, movie_list):
        """
        Write every movie from the current page to the local CSV file.
        :param movie_list: all movie dicts collected from the current page
        :return:
        """
        for movie in movie_list:
            self.writer.writerow(movie)
        self.get_next_page_url()

    def open_file(self):
        csv_file = open('movie1.csv', 'w', encoding="gb18030")
        self.writer = csv.DictWriter(csv_file,
                                     fieldnames=['movie_rank', 'movie_name', 'movie_member', 'movie_star',
                                                 'movie_comment', 'movie_quote'])
        self.writer.writeheader()

    def get_next_page_url(self):
        a = self.html_obj.xpath('//span[@class="next"]/a')
        if len(a) == 0:
            print('最后一页了')
            return
        next_page = a[0].xpath('@href')[0]
        self.get_page_code(next_page)

    def get_content_by_css(self, html_obj):
        item_div = html_obj.cssselect('.item')
        for div_tag in item_div:
            em = div_tag.cssselect('em')[0].text
            name_list = div_tag.cssselect('a>span')
            name = ''
            for name_tag in name_list:
                name_str = name_tag.text.strip().strip('\n')
                name += name_str
            bd = div_tag.cssselect('.bd')[0]
            p_list = bd.cssselect('p')
            if len(p_list) == 2:
                member = p_list[0].text
                quote = p_list[1].cssselect('span')[0].text
            else:
                member = p_list[0].text
            star_list = bd.cssselect('.star>span')
            star = star_list[1].text
            comment_number = star_list[3].text
            comment_number = re.search(re.compile(r'(\d+)'), comment_number).group(1)
        print('css选择器选择完毕')


if __name__ == '__main__':
    movie_obj = DBMovie()
    movie_obj.open_file()
    movie_obj.get_page_code()
Scrape the Douban Books Top 250
Scrape the Douban Music Top 250
Scrape the KuGou Top 500
WeChat articles
Scrape short-term rental listings for the Beijing area
Scrape the full text of the novel 《斗破苍穹》 (Battle Through the Heavens)
Scrape jokes from Qiushibaike
Scrape user location info from Qiushibaike
Scrape novel info from Qidian (qidian.com)
Scrape images from PEXELS
Scrape second-hand listings from the Zhuanzhuan marketplace
Scrape hot-comment articles from Jianshu
Scrape user activity feeds from Jianshu
Scrape Jianshu's 7-day trending list
Scrape Jianshu's popular collections
Scrape the articles included in Jianshu collections
Scrape Jianshu's recommended content
Scrape job postings from Lagou
Scrape friends' feeds from Sina Weibo
Scrape friends' "shuoshuo" posts from QQ Zone
Scrape product listings from Taobao
Scrape the Python highlights topic from Zhihu