This is the first time I've written something like this, so I'm not sure whether I'll get flamed for it. The testing process turned out to be fairly tedious.
Images from http://www.****.com: the listing API reports 72134 images, but my database holds 72201 rows and I ended up with 72071 downloaded files, so there are probably duplicates. The files aren't split into folders.
This covers the waterfall feed on the home page; I've also crawled the article pages.
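Since those three counts don't line up, one option would be to deduplicate the rows pulled from the database before downloading. A minimal sketch, not part of the original script, assuming the same `topics_img` table as below with the image file name in column index 4 (r[4]):

# Hypothetical helper: drop rows whose image file name has already been seen.
def dedup_rows(rows):
    seen = set()
    unique = []
    for r in rows:
        if r[4] not in seen:   # r[4] = image file name, as in getData() below
            seen.add(r[4])
            unique.append(r)
    return unique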
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import threading
import time

import cymysql
import requests


def mkdir(path):
    # Create the target directory if it does not exist yet.
    path = path.strip().rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
        return True
    return False


def getData(r):
    try:
        term = r[4]    # image file name from the DB row
        title = r[2]   # topic title, used as the folder name
        mkdir("./topics_img/" + title)
        file_url = "http://cdn.girlimg.com/images/" + term
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36",
            "Referer": "http://www.girlimg.com/images/0",
        }
        # requests takes the timeout in seconds; the original 600000 looked like a milliseconds value
        resp = requests.get(file_url, stream=True, headers=header, timeout=600)
        with open("./topics_img/" + title + "/" + term + ".jpg", "wb") as img:
            print(file_url)
            for chunk in resp.iter_content(chunk_size=1024):
                if chunk:
                    img.write(chunk)
    except Exception:
        print("wrong: {0}".format(r))


def main():
    conn = cymysql.connect(host='127.0.0.1', user='root', passwd='', db='girlimg', charset='utf8')
    cur = conn.cursor()
    # cur.execute('SELECT * FROM `img` WHERE id > 14249')
    cur.execute('SELECT * FROM `topics_img`')
    for r in cur.fetchall():
        time.sleep(0.1)  # throttle: start at most ~10 download threads per second
        worker = threading.Thread(target=getData, args=(r,))
        worker.start()


if __name__ == '__main__':
    main()
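One thing I'd reconsider in the threading part: the loop above spawns a new thread roughly every 0.1 s and never joins them, so with 70k+ rows slow downloads can pile up a lot of live threads. A sketch of the same download loop with a bounded pool instead, using the standard-library concurrent.futures (Python 3); it assumes the getData() and cymysql setup from the script above, and main_pooled is just a hypothetical name:

from concurrent.futures import ThreadPoolExecutor


def main_pooled():
    conn = cymysql.connect(host='127.0.0.1', user='root', passwd='', db='girlimg', charset='utf8')
    cur = conn.cursor()
    cur.execute('SELECT * FROM `topics_img`')
    rows = cur.fetchall()
    # Only max_workers threads exist at any time; the remaining rows wait in
    # the executor's queue instead of each getting their own thread.
    with ThreadPoolExecutor(max_workers=10) as pool:
        pool.map(getData, rows)

The with-block also waits for all downloads to finish before the script exits, which the original thread-per-row version doesn't guarantee.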