Vulnerability scanner: acquiring and processing URLs

Payloads and exploits aside, the most important part of writing a scanner is acquiring and processing URLs.

Everything else branches out from the URLs you collect: finding and handling sensitive-information pages, vulnerability detection, and so on are all built on this foundation. I wrote a version of this before, and today I optimized parts of the code again.
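
As a quick reference (an illustrative sketch, not part of the original code), Python's urlparse, which the crawler below leans on throughout, splits a URL into indexed fields; the URL here is arbitrary:

from urllib.parse import urlparse

p = urlparse("http://www.example.com/news/index.php?id=123#top")
print(p[0])  # scheme -> 'http'
print(p[1])  # netloc -> 'www.example.com'
print(p[2])  # path   -> '/news/index.php'
print(p[4])  # query  -> 'id=123'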

The old code: http://www.youknowi.xin/2019/04/%e5%a4%9a%e8%bf%9b%e7%a8%8b%e5%b5%8c%e5%a5%97%e5%a4%9a%e8%bf%9b%e7%a8%8b%e5%ae%9e%e7%8e%b0url%e7%88%ac%e5%8f%96/ (you can compare it to see what was optimized)

Today's revised code:

# coding=utf-8
import requests
import re
from urllib.parse import urlparse
from urllib import parse
import sys
import vthread
import random
import urllib3  # why disable warnings? https://www.cnblogs.com/xuchunlin/p/7607507.html

urllib3.disable_warnings()
requests.adapters.DEFAULT_RETRIES = 3

class crawl_target(object):
    def __init__(self, url):
        self.url = url
        self.collect_url = set()         # URLs crawled so far
        self.collect_target_url = set()  # URLs that qualify for vulnerability scanning
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1',
                        'Connection': 'close'}
        # self.num = random.randint(10, 20)  # crawl depth
        self.num = 2  # crawl depth
        self.filter_strings = ['.png', '.ico', '.bmp', '.svg',
                               '.gif', '/css/', '/js/', 'about:blank',
                               '.jpeg', '.json', '.js', 'css', 'pdf',
                               'txt', 'javascript', '.jpg', 'javascrip']
        self.mingan_strings = ['.bak', '.xml', '.config', '.mdb', '.sql',
                               '.dbf', '.backup', 'syntax error', '后台登陆']
        # ['txt', 'conf', '.ini']
    def run(self):
        self.a = urlparse(self.url)
        b = self.a[1].split(".")  # e.g. www.baidu.com -> ['www', 'baidu', 'com']
        self.Is_target_url = ".".join(b[-2:])  # e.g. baidu.com (last two labels, so bare domains work too)
        # self.Is_target_url = self.a[1]  # alternative: restrict to the exact host, e.g. www.baidu.com
        self.target_url = self.a[0] + "://" + self.a[1] + "/"  # normalized base URL of the target
        num = 0
        self.crawl(self.url, num)
    @vthread.pool(30)
    def crawl(self, url, num):
        temp_collect_urls = set()  # stage URLs here so collect_url is not updated one at a time
        num = num + 1
        temp_url_parse = urlparse(url)  # parse of the page being crawled, e.g. news.qq.com
        if num <= self.num:
            urls = self.getPageLinks(url)
            urls = list(set(urls))
            for i in urls:
                # urljoin resolves absolute links (http://www.qq.com/...), protocol-relative
                # links (//news.qq.com/...) and relative links (aaa/qqq) in one step;
                # note that str.lstrip("http://") strips characters, not a prefix,
                # which is why the old version mangled some links
                i = parse.urljoin(url, i).rstrip("/")
                if self.Is_target_url not in urlparse(i)[1]:
                    continue  # stay on the target domain
                # decide whether the resolved URL should be crawled
                if i not in self.collect_url and \
                        (True not in [filter_string.lower() in i.lower() for filter_string in self.filter_strings]) \
                        and re.match(r'^https?:/{2}\w.+$', i):
                    temp_collect_urls.add(i)
            for i in temp_collect_urls:
                self.collect_url.add(i)
            if self.judge(url):  # quick check of whether this page can be scanned
                scan_target = Scan_target(url)
                scan_target.run()
            for i in temp_collect_urls:
                # static .htm pages could be filtered out here
                self.crawl(i, num)
    def getPageLinks(self, url):
        # .text is already a decoded str; the old .text.encode('utf-8') produced
        # bytes that then had to be wrapped in str() everywhere
        content = requests.get(url, timeout=6, verify=False, headers=self.headers).text
        self.get_mingan_infor(url, content)
        links = []
        tags = ['a', 'A', 'link', 'script', 'area', 'iframe', 'form']  # img
        tos = ['href', 'src', 'action']
        for tag in tags:
            for to in tos:
                # match both href="..." and href='...' attribute styles
                link1 = re.findall(r'<%s.*?%s="(.*?)"' % (tag, to), content)
                link2 = re.findall(r'<%s.*?%s=\'(.*?)\'' % (tag, to), content)
                for i in link1:
                    links.append(i)
                for i in link2:
                    if i not in links:
                        links.append(i)
        return links

    def deal_url(self, url):
        url = url.replace("&amp;", "&")
        url = url.replace("#", "")
        url = url.replace(" ", "+")
        return url

    # quick check of whether a URL is worth scanning
    def judge(self, url):
        # ParseResult(scheme='http', netloc='www.baidu.com', path='/aaaa/bbbb/cccc/index.php', params='', query='id=123', fragment='')
        url_parse = urlparse(url)
        if "=" in url_parse[4] and ".htm" not in url:
            temp_a = []
            # pull the parameter names out of the query string, e.g. id=1&cat=2 -> ['id', 'cat']
            a = re.findall(".*?=", url_parse[4])
            for i in a:
                if "&" in i:
                    i = i.split("&")[1]
                i = i.replace("=", "")
                temp_a.append(i)
            temp_query = ''
            for i in temp_a:
                temp_query = temp_query + "&" + i
            # signature URL: same path + same parameter names = same scan target
            temp_url = url_parse[0] + "://" + url_parse[1] + url_parse[2] + "?" + temp_query.lstrip("&")
            print("candidate signature {}".format(temp_url))
            if temp_url not in self.collect_target_url:
                print("new scan target {}".format(url))
                self.collect_target_url.add(temp_url)
                return True
            else:
                return False
        else:
            return False

    def get_mingan_infor(self, url, content):
        # flag pages whose body contains sensitive strings (backup files, SQL errors, admin login, ...)
        for i in self.mingan_strings:
            if i in content:
                print("[+] page {} contains ({})".format(url, i))

class Scan_target(object):
    def __init__(self, url):
        self.url = url

    def run(self):
        print("scanning {}".format(self.url))


if len(sys.argv) != 2:
    print("usage: python crawl.py https://www.baidu.com")
    exit()
url = sys.argv[1]
crawl_target(url.rstrip("/")).run()
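
Two design notes. First, crawl is decorated with vthread.pool(30), so each recursive self.crawl call is handed off to a pool of 30 worker threads instead of blocking the caller. Second, judge() deduplicates scan targets by path plus parameter names, so the same injection point is only queued once; here is a hypothetical standalone re-implementation of that signature logic (the helper name and example URLs are mine) for illustration:

# Hypothetical helper mirroring the signature judge() builds:
# same path + same parameter names = one scan target.
from urllib.parse import urlparse

def signature(url):
    p = urlparse(url)
    names = [seg.split("=")[0] for seg in p.query.split("&") if "=" in seg]
    return p.scheme + "://" + p.netloc + p.path + "?" + "&".join(names)

print(signature("http://t.com/a.php?id=1&cat=2"))  # http://t.com/a.php?id&cat
print(signature("http://t.com/a.php?id=9&cat=5"))  # same signature -> not queued again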


2019.6.4