# -*- coding: utf-8 -*-
# Demo: syncs VIP novels from 17k.com. Runs in the background; whenever a VIP
# novel on 17k is updated, it is collected automatically.
import urllib
import urllib2
import commands
import time
import threading
import os
import re
import sys
thlen = 10
#定义同时采集的线程数
books = []
#定义需要采集的书库
tsk = []
#定义采集线程数组
bookdict = {}
#定义已采集图书字典,key为目标站书号,value为字数
domain = 'yoursite.domain.com'
adminuser = 'admin'
adminpass = '******'
siteid = '23'
# notaddnew = '0'
frompage = 'http://all.17k.com/lib/book/2_0_0_0_0_0_2_0_1.html'
def addbooklist():
while 1:
time.sleep(30)
print '[' + time.strftime('%H:%M:%S') + '] 采集更新列表线程启动。'
start = time.time()
try:
response = urllib2.urlopen(frompage, timeout = 12)
content = response.read()
except:
continue
response.close()
elapsed = (time.time() - start)
bookattr = re.findall(r'<a class=\"jt\" rel=\"/tip\.xhtml\?book\.id=([0-9]+)\&difference[^>]+>([^<]+)</a>*[\s\S]*?<td class=\"td5\">([0-9]+)</td>',content,re.M)
print '[' + time.strftime('%H:%M:%S') + '] 采集更新列表结束,用时:' + str(elapsed) + '秒'
for ii in range(len(bookattr)):
newbookid = bookattr[ii][0]
newbookname = bookattr[ii][1]
newbooksize = bookattr[ii][2]
inlist = False
for tt in range(len(books)):
if (books[tt][0]==newbookid):
inlist = True
if not inlist:
#书号不在待采集数组里
if (newbookid in bookdict.keys()):
#书号在已采集过的字典里(需要根据字数来判断是否有更新)
if (int(newbooksize)>int(bookdict[newbookid])):
#采集到书籍字数大于已采集字典里的字数(添加到待采集列表)
books.append([newbookid,newbookname,newbooksize])
print '书号:' + newbookid + '有更新,旧字数:'+ bookdict[newbookid] + ' 新字数:'+ newbooksize + ' 添加到待采集列表。'
else:
#书号不在已采集过的字典里(添加到待采集列表)
books.append([newbookid,newbookname,newbooksize])
print '书号:' + newbookid + '最近未采集,添加到待采集列表。'
print '[' + time.strftime('%H:%M:%S') + '] 采集更新列表线程完成,线程休眠。'
def caiji(bookid,bookname,booksize):
print '正在采集 书号[' + bookid + '] 书名:' + bookname
url = 'http://'+ domain + '/modules/article/admin/batchcollect.php?action=bcollect&siteid=' + siteid + '&batchids=' + bookid + '&jieqi_username=' + adminuser + '&jieqi_userpassword=' + adminpass
start = time.time()
page = urllib2.urlopen(url,timeout=3600)
data = page.read(8192)
while data:
data = page.read(8192)
page.close()
elapsed = (time.time() - start)
time.sleep(5) #采集完等5秒生成全书
print '书号[' + bookid + '] 书名:' + bookname + '字数:' + booksize + 'k 采集完成! 用时:' + str(elapsed) + '秒'
print '书号[' + bookid + '] 书名:' + bookname + '字数:' + booksize + 'k 添加到最近采集书目字典。'
# 从网页获取要采集的文章ID和文章名字(首次)
start = time.time()
response = urllib2.urlopen(frompage, timeout = 12)
content = response.read()
response.close()
elapsed = (time.time() - start)
getattr = re.findall(r'<a class=\"jt\" rel=\"/tip\.xhtml\?book\.id=([0-9]+)\&difference[^>]+>([^<]+)</a>*[\s\S]*?<td class=\"td5\">([0-9]+)</td>',content,re.M)
#getsize = re.findall(r'<td class=\"td5\">([0-9]+)</td>',content,re.M)
print '首次获取要采集的文章共' + str(len(getattr)) +'篇,用时:' + str(elapsed) + '秒'
books = books + getattr
if (len(books)<3):
print('获取列表页失败,退出!')
exit()
#启动书籍列表采集线程
thaddbooklist = threading.Thread(target=addbooklist,name='taddbooklist')
thaddbooklist.start()
for x in range(thlen):
bookid = books[0][0]
bookname = books[0][1]
booksize = books[0][2]
tname = 't' + str(x)
th = threading.Thread(target=caiji,name=tname,args=(bookid,bookname,booksize))
th.start()
del books[0]
bookdict[bookid] = booksize
tsk.append(th)
#检测空闲线程,当线程闲置时,若待采集列表不为空时,启用该线程进行采集
while 1:
time.sleep(5)
for i in range(len(tsk)):
if not tsk[i].is_alive():
print tsk[i].name + '线程空闲'
if len(books) > 0:
bookid = books[0][0]
bookname = books[0][1]
booksize = books[0][2]
th = threading.Thread(target=caiji,name=tsk[i].name,args=(bookid,bookname,booksize))
th.start()
del books[0]
bookdict[bookid] = booksize
tsk[i] = th
# siteid = '23' is the collection-rule ID; see the Jieqi admin panel and config file.
补充运行输出:
[11:10:44] 采集更新列表线程启动。
[11:10:45] 采集更新列表结束,用时:0.368046998978秒
书号:1257715无更新,旧字数:508549 新字数:508549 忽略。
书号:437108无更新,旧字数:3070245 新字数:3070245 忽略。
#自动登录后台,自动抢购脚本。
#vmid:自己登录后创建VPS,然后看源码。里面有vmid字样
email:登录用户名email
password:这个不说了。。。
代码复制下来 保存为op.py
linux下运行命令 python /目录/op.py
目前linux一般都带python,哪怕是最小安装。
没有的话
Debian/Ubuntu apt-get install python
CentOS/RHEL yum install python
------------------------
# -*- coding: utf-8 -*-
# !/usr/bin/python
################ vpskk原创作品,转载请注明出处http://www.vpskk.com #####################
import urllib2
import urllib
import cookielib
import re
import time
# Target VM id and panel credentials (fill in before running).
vmid = "your vm id"
email = "your email"
password = "your password"

# Panel endpoints and per-location availability marker strings.
auth_url = 'https://panel.op-net.com/login'
check_url = 'https://panel.op-net.com/cloud/open'
create_url = 'https://panel.op-net.com/cloud/open'
hk_check_str = '<span>Hong Kong'
jp_check_str = '<span>Tokyo'

# Login form payload.
data = {
    "email": email,
    "password": password,
    "submit": "Sign in",
}

# Payload for the create-VM check page.
checkdata = {
    "vm_id": vmid,
    "x": "19",
    "y": "24",
}

# Attempt counter for the log output.
i = 1
# Cookie-aware opener shared by every request in the loop below.
cookieJar = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar))
while 1:
try:
time.sleep(10)
print '[' + time.strftime('%H:%M:%S') + ']第' + str(i) +'次检测。'
i = i +1
# urllib进行编码
post_data=urllib.urlencode(data)
check_data=urllib.urlencode(checkdata)
# 发送头信息
headers ={
"Host":"panel.op-net.com",
"Referer":auth_url,
"User-Agent":"Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36",
}
# 访问检查页 如果获取到csrf则不需要登陆
reqcheck=urllib2.Request(check_url,check_data,headers)
resultcheck = opener.open(reqcheck)
csrf=re.findall(r'csrf_token" value="([0-9a-z]+)"/>',resultcheck.read(),re.M)
if (len(csrf)>0):#找到csrf,不用再登陆
print "have csrf:" + csrf[0]
else:
print "No csrf,auto login......"
# 登陆获取cookie
req=urllib2.Request(auth_url,post_data,headers)
result = opener.open(req)
print "Login OK,next get csrf."
# 访问检查页 重新获取csrf
reqcheck=urllib2.Request(check_url,check_data,headers)
resultcheck = opener.open(reqcheck)
html = resultcheck.read()
csrf=re.findall(r'csrf_token" value="([0-9a-z]+)"/>',html,re.M)
if (len(csrf)>0):#找到csrf
print "new get csrf:" + csrf[0]
else:
print "no csrf,quit."
continue
if not hk_check_str in html:
print "HK is available,next create it!"
location = "13"
elif not jp_check_str in html:
print "JP is available,next create it!"
location = "14"
else:
print "HK and JP is unavaileable,quit."
continue
# 创建VM数据
create={
'csrf_token':csrf[0],
'plan':'Plan 01',
'vm_id':vmid,
'location':location,
'os':'linux-debian-6.0.6-x86_64-min-gen2-v1',
'hostname':'op.vpskk.com',
'root':'',
}
# urllib进行编码
create_data=urllib.urlencode(create)
reqcreate=urllib2.Request(create_url,create_data,headers)
result = opener.open(reqcreate)
if "The requested location is currently unavailable" in result.read():
print "unavailable...waiting for checking again......"
else:
print "Create VM OK"
break
except:
continue |
|