40-Day Training - Day 6-4 - Information Processing: Subdomain Brute-Forcing, Version Identification, CMS Identification, Directory Brute-Forcing, URL Crawling, and Database Handling

Took two days off and spent one of them out and about. A day really does revolve around three meals: grab something quick for breakfast, and it's already time to prepare lunch; take a stroll, and it's time to make dinner. My most productive study hours are usually in the evening anyway.

Personal take:

I reworked the code to store results in a database. Where efficiency is concerned, I admit defeat; I honestly don't know how else to speed this up, so for now I'll make do with the samples already collected. Of 1,500 targets I ran 450, which yielded about 180k subdomains and took ages; probing those 180k for liveness, containers, and middleware left about 70k usable ones, which also took ages; directory brute-forcing the 70k subdomains ran only briefly before I stopped it, covering roughly 60 targets. Later I'll pick a wordlist of about 200 common homepage directories and retest (the original list was dirb's). The final step is crawling the links.

These steps are order-independent: once you have the link samples, you can process and filter them as needed. (You could classify along the way, but the same old problem applies: at scale, synchronization gets messy, so it's easier to filter at the end; it doesn't affect the goal.)

Efficiency really is the hard constraint.

Technical summary:

1. Database connections: insert in small batches. The key words: small, and batched.
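
A sketch of what that looks like with pymysql's executemany, against the same subdomains table the first script below uses (the 2000-row batch size is just illustrative):

import pymysql

def insert_batch(rows, batch_size=2000):
    """Insert rows into the subdomains table in small batches."""
    db = pymysql.connect(host='localhost', user='root', password='root',
                         db='Target_info', port=3306)
    cursor = db.cursor()
    try:
        for i in range(0, len(rows), batch_size):
            # executemany binds the values, so quotes inside the data cannot break the SQL
            cursor.executemany("insert into subdomains (subdomains) values (%s)",
                               [(r,) for r in rows[i:i + batch_size]])
            db.commit()  # commit per batch: a failure loses at most one batch
    finally:
        db.close()

Parameter binding also avoids the hand-assembled VALUES string used in the scripts below, which breaks as soon as a value contains a single quote.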

2. Wordlists:

Kali ships with plenty of wordlists; locate them with: find / -name wordlist or find / -name wordlists

For subdomains I used lijiejie's subdomains_full.txt (~15,000 entries); for web directories, dirb's common.txt (~4,700 entries). I'd recommend cutting the directory list down to about 200 entries (which is also what I plan to do next).
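
If you'd rather do the trimming in Python, something like this works (assuming dirb's default path on Kali; common_top200.txt is just a name I picked):

# keep the first 200 non-comment entries of dirb's common.txt
with open("/usr/share/dirb/wordlists/common.txt") as f:
    dirs = [line.strip() for line in f if line.strip() and not line.startswith("#")][:200]
with open("common_top200.txt", "w") as f:
    f.write("\n".join(dirs) + "\n")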

3. I also stopped using whatweb here: in the Kali VM, whatweb at 50 or 100 threads bogs the machine down until it loses network connectivity.

Instead I simply read the Server header from a requests response (.headers['Server']) to judge the container, request :7001/console to detect WebLogic, and request :8080/manager/html to detect Tomcat.
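
Distilled, the check amounts to the sketch below; the probe paths match what Judge_middleware2 in the scripts does, while fingerprint itself is just an illustrative wrapper:

import requests

def fingerprint(target):
    """target like 'http://host'; returns (server_header, middleware)."""
    server, middleware = "", ""
    try:
        s = requests.get(target, timeout=6)
        server = s.headers.get("Server", "")  # container banner, e.g. nginx or Apache
    except requests.RequestException:
        return server, middleware
    try:
        # the WebLogic admin console usually answers on 7001
        if requests.get(target + ":7001/console", timeout=3).status_code != 404:
            middleware = "weblogic"
    except requests.RequestException:
        pass
    if not middleware:
        try:
            # the Tomcat manager usually answers on 8080
            if requests.get(target + ":8080/manager/html", timeout=3).status_code != 404:
                middleware = "tomcat"
        except requests.RequestException:
            pass
    return server, middleware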

This post contains just these four scripts; the processing helpers they import can be found in the earlier posts of this series.
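
For readers without the earlier posts at hand, hypothetical stand-ins for the load_dict helpers could look like the following; the file names are my assumptions, not the originals:

# ../libs/load_dict.py -- hypothetical stand-ins; the real helpers are in the
# earlier posts. Each one simply reads one entry per line from a file.
import codecs

def _lines(path):
    with codecs.open(path, "r", "utf-8") as f:
        return [line.strip() for line in f if line.strip()]

def load_domain_dict():      # subdomain prefixes, e.g. lijiejie's subdomains_full.txt
    return _lines("../dict/subdomains_full.txt")

def load_dns_domains():      # resolver IPs
    return _lines("../dict/dns_servers.txt")

def load_suffix():           # per-country domain suffixes
    return _lines("../dict/suffix.txt")

def load_target_domains(suffix_list):  # target root domains (suffix_list unused here)
    return _lines("../dict/targets.txt")

def load_domain_results():   # "domain:ip" rows from the subdomain step; keep the domain
    return [line.split(":")[0] for line in _lines("../store/deal/subdomains.txt")]

def load_broken_urls():      # live urls from the identification step
    return _lines("../store/deal/info_result.txt")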

Collecting subdomains:

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import gevent
from gevent import monkey, pool

monkey.patch_all()
import dns.resolver
import time
import codecs
import threadpool

import sys, os

sys.path.append(os.path.abspath('../libs'))
from load_dict import load_suffix, load_domain_dict, load_dns_domains, load_target_domains

import pymysql

# Query the DNS resolver to check whether the subdomain exists
def domain_query(domain=None):
    try:
        ans = resolver.query(domain)
        if ans:
            ips = ', '.join(sorted([i.address for i in ans]))
            # skip wildcard/blackhole answers
            if "0.0.0.1" not in ips and "127.0.0.1" not in ips and "0.0.0.0" not in ips:
                print(domain + ":" + ips)
                domain_results.append(domain + ":" + ips)
    except:
        pass


# debugging helper: print and record a subdomain
def test(subdomain):
    print(subdomain)
    domain_results.append(subdomain)

# Brute-force one target's subdomains; once more than 10000 results have
# accumulated, flush them to MySQL in batches of 2000
def scan_subdomain(domain):
    global domain_results
    global Insert_subdomains_Sql
    if len(domain_results) > 10000:
        print(len(domain_results))
        domain_results2 = domain_results
        domain_results = list()
        nums = len(domain_results2) // 2000
        print(nums)
        db = pymysql.connect(host='localhost', user='root', password='root', db='Target_info', port=3306)
        cursor = db.cursor()
        for i in range(nums):
            if i == (nums - 1):
                temp_domain_results = domain_results2[2000 * i:]
            else:
                temp_domain_results = domain_results2[2000 * i:2000 * (i + 1)]
            for temp_domain_result in temp_domain_results:
                Insert_subdomains_Sql = Insert_subdomains_Sql + "(\'" + temp_domain_result + "\'),"
            Insert_subdomains_Sql = Insert_subdomains_Sql.rstrip(",")
            try:
                if cursor.execute(Insert_subdomains_Sql):
                    print("Successful")
                    db.commit()
            except:
                print("Failed")
                db.rollback()
                # fall back to dumping this batch to a file
                with codecs.open("../store/deal/不带冒号的文件.txt", "a+") as f:
                    for temp_domain_result in temp_domain_results:
                        f.write(temp_domain_result)
                        f.write("\n")
            # reset the statement prefix for the next batch
            Insert_subdomains_Sql = " insert into subdomains (subdomains) values "
        db.close()

    # one greenlet per wordlist prefix, 50 concurrent
    scan_pool = pool.Pool(50)
    gevent_list = [scan_pool.spawn(domain_query, (domain_dict + "." + domain)) for domain_dict in domain_dicts]
    gevent.joinall(gevent_list)


if __name__ == '__main__':
    start_time = time.time()

    # Load the wordlists and target list
    domain_results = list()
    domain_dicts = load_domain_dict()  # subdomain brute-force prefixes
    dns_servers = load_dns_domains()  # DNS resolvers to query
    suffix_list = load_suffix()  # per-country domain suffixes
    domains = load_target_domains(suffix_list)  # target domains
    domains = domains[451:]  # resume after the targets already processed in a previous run

    resolver = dns.resolver.Resolver()
    # dns_servers = ['114.114.114.114', '8.8.8.8', '223.5.5.5', '223.6.6.6', '119.29.29.29', '182.254.116.116']
    resolver.lifetime = resolver.timeout = 6.0
    resolver.nameservers = dns_servers  # default: ['114.114.114.114', '8.8.8.8']

    Insert_subdomains_Sql = " insert into subdomains (subdomains) values "

    # one worker thread per target domain, 30 at a time
    thread_pool = threadpool.ThreadPool(30)
    reqs = threadpool.makeRequests(scan_subdomain, domains)
    [thread_pool.putRequest(req) for req in reqs]  # dispatch all requests to the pool
    thread_pool.wait()  # block until all threads finish

    # insert whatever is left into the database
    db = pymysql.connect(host='localhost', user='root', password='root', db='Target_info', port=3306)
    cursor = db.cursor()
    for domain_result in domain_results:
        Insert_subdomains_Sql = Insert_subdomains_Sql + "(\'" + domain_result + "\'),"
    Insert_subdomains_Sql = Insert_subdomains_Sql.rstrip(",")
    try:
        if cursor.execute(Insert_subdomains_Sql):
            print("Successful")
            db.commit()
            domain_results = list()
            Insert_subdomains_Sql = " insert into subdomains (subdomains) values "
    except:
        print("Failed")
        db.rollback()
        # fall back to dumping the remaining results to a file
        with codecs.open("../store/deal/不带冒号的文件.txt", "a+") as f:
            for domain_result in domain_results:
                f.write(domain_result)
                f.write("\n")
        domain_results = list()
        Insert_subdomains_Sql = " insert into subdomains (subdomains) values "
    # close the database connection
    db.close()
    end_time = time.time()
    print(end_time - start_time)
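
The synchronization pain mentioned at the top can be tamed by cutting batches under a lock, so that no two workers flush the same results twice. A sketch with hypothetical record/flush helpers (not part of the scripts):

import threading

results_lock = threading.Lock()
results = []

def record(item, flush, batch=2000):
    """Append one result; when a batch is full, cut it atomically and flush it."""
    to_flush = None
    with results_lock:
        results.append(item)
        if len(results) >= batch:
            to_flush = results[:]  # copy the full batch
            del results[:]         # and empty the shared list in place
    if to_flush:
        flush(to_flush)  # e.g. the batched executemany insert shown earlier

The append itself is rarely the problem; it's the check-then-flush sequence that races across threads, and the lock makes it atomic.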

Identifying each target's container and middleware:

#!/usr/bin/env python
# -*- encoding: utf-8 -*-

import gevent
from gevent import monkey, pool

monkey.patch_all()
import sys, os

sys.path.append(os.path.abspath('../libs'))
from load_dict import load_domain_results,load_middleware_tomcat,load_middleware_weblogic
import codecs
import requests
import threadpool
import pymysql

# Fuzz-based check (unused below; requires weblogic_fuzz / tomcat_fuzz to be loaded)
def Judge_middleware(url):
    # results are tagged like "url$$Server", so strip the tag to get the bare target
    if "$$" in url:
        target = url.split("$$")[0]
    else:
        target = url
    num = 0
    # weblogic
    try:
        s = requests.get(target + ":7001", timeout=6)
        if s.status_code == 200:
            for i in weblogic_fuzz:
                try:
                    s = requests.get(target + ":7001/" + i, timeout=6)
                    if s.status_code != 404:
                        num = num + 1
                    if num >= 3:
                        print(url + "$$" + "weblogic")
                        info_results.append(url + "$$" + "weblogic")
                        break
                except:
                    pass
    except:
        pass
    # tomcat
    if num == 0:
        try:
            s = requests.get(target + ":8080", timeout=6)
            if s.status_code == 200:
                for i in tomcat_fuzz:
                    try:
                        s = requests.get(target + ":8080/" + i, timeout=6)
                        if s.status_code != 404:
                            num = num + 1
                        if num >= 3:
                            print(url + "$$" + "tomcat")
                            info_results.append(url + "$$" + "tomcat")
                            break
                    except:
                        pass
        except:
            pass
    # neither weblogic nor tomcat
    if num <= 2:
        print(url)
        info_results.append(url)

def Judge_middleware2(url):
    # results are tagged like "url$$Server", so strip the tag to get the bare target
    if "$$" in url:
        target = url.split("$$")[0]
    else:
        target = url
    num = 0
    try:
        s = requests.get(target + ":7001/console", timeout=3)
        if s.status_code != 404:
            print(url + "$$" + "weblogic")
            info_results.append(url + "$$" + "weblogic")
            num = 1
    except:
        pass
    if num == 0:
        try:
            s = requests.get(target + ":8080/manager/html", timeout=3)
            if s.status_code != 404:
                print(url + "$$" + "tomcat")
                info_results.append(url + "$$" + "tomcat")
                num = 1
        except:
            pass
    # record the bare url only when no middleware was identified
    if num == 0:
        print(url)
        info_results.append(url)



def first_judge_http_https(i):
    # try http first, fall back to https; tag the url with its Server header
    try:
        target = "http://" + i
        s = requests.get(target)
        try:
            if s.headers["Server"]:
                target = target + "$$" + s.headers["Server"]
        except:
            pass
        finally:
            Judge_middleware2(target)
    except:
        try:
            target = "https://" + i
            s = requests.get(target)
            try:
                if s.headers["Server"]:
                    target = target + "$$" + s.headers["Server"]
            except:
                pass
            finally:
                Judge_middleware2(target)
        except:
            pass

def gevent_thread_run(i):
    global info_results
    global thread_nums
    global Insert_subdomains_Sql
    # flush to MySQL in batches of 2000 once more than 10000 results have accumulated
    if len(info_results) > 10000:
        print(len(info_results))
        info_results2 = info_results
        info_results = list()
        nums = len(info_results2) // 2000
        print(nums)
        db = pymysql.connect(host='localhost', user='root', password='root', db='Target_info', port=3306)
        cursor = db.cursor()
        for j in range(nums):  # j, not i: the batch loop must not clobber the thread index
            if j == (nums - 1):
                temp_info_results = info_results2[2000 * j:]
            else:
                temp_info_results = info_results2[2000 * j:2000 * (j + 1)]
            for temp_info_result in temp_info_results:
                Insert_subdomains_Sql = Insert_subdomains_Sql + "(\'" + temp_info_result + "\'),"
            Insert_subdomains_Sql = Insert_subdomains_Sql.rstrip(",")
            try:
                if cursor.execute(Insert_subdomains_Sql):
                    print("Successful")
                    db.commit()
            except:
                print("Failed")
                db.rollback()
                # fall back to dumping this batch to a file
                with codecs.open("../store/deal/info_result.txt", "a+") as f:
                    for temp_info_result in temp_info_results:
                        f.write(temp_info_result)
                        f.write("\n")
            # reset the statement prefix for the next batch
            Insert_subdomains_Sql = " insert into info_result (info_result) values "
        db.close()
    # load this thread's slice of subdomains (2000 per thread; the last thread takes the rest)
    if i == (thread_nums - 1):
        temp_domain_results = subdomains[2000 * i:]
    else:
        temp_domain_results = subdomains[2000 * i:2000 * (i + 1)]
    scan_pool = gevent.pool.Pool(50)
    gevent_list = [scan_pool.spawn(first_judge_http_https, domain_dict) for domain_dict in temp_domain_results]
    gevent.joinall(gevent_list)

if __name__ == '__main__':
    # fuzz lists for Judge_middleware (disabled; Judge_middleware2 is used instead):
    # tomcat_fuzz = load_middleware_tomcat()
    # weblogic_fuzz = load_middleware_weblogic()
    # load the subdomains collected in the previous step
    subdomains = load_domain_results()

    Insert_subdomains_Sql = " insert into info_result (info_result) values "

    info_results = []
    # one worker thread per 2000 urls
    thread_nums = len(subdomains) // 2000 + 1
    b = range(thread_nums)
    print("Split into: " + str(thread_nums))
    thread_pool = threadpool.ThreadPool(20)
    reqs = threadpool.makeRequests(gevent_thread_run, b)
    [thread_pool.putRequest(req) for req in reqs]  # dispatch all requests to the pool
    thread_pool.wait()  # block until all threads finish

    # insert the remaining info_result rows into the database
    db = pymysql.connect(host='localhost', user='root', password='root', db='Target_info', port=3306)
    cursor = db.cursor()
    for info_result in info_results:
        Insert_subdomains_Sql = Insert_subdomains_Sql + "(\'" + info_result + "\'),"
    Insert_subdomains_Sql = Insert_subdomains_Sql.rstrip(",")
    try:
        if cursor.execute(Insert_subdomains_Sql):
            print("Successful")
            db.commit()
            info_results = list()
            Insert_subdomains_Sql = " insert into info_result (info_result) values "
    except:
        print("Failed")
        db.rollback()
        # fall back to dumping the remaining results to a file
        with codecs.open("../store/deal/info_result.txt", "a+") as f:
            for info_result in info_results:
                f.write(info_result)
                f.write("\n")
        info_results = list()
        Insert_subdomains_Sql = " insert into info_result (info_result) values "
    # close the database connection
    db.close()

Brute-forcing website directories:
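
The wordlist here was dirb's common.txt; below is a minimal sketch of this step under that setup, reusing the gevent-pool pattern of the scripts above. The dir_results table, the example target, and the 200-entry cut are assumptions:

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Minimal sketch, not the original script: probe each live target with the
# first 200 entries of dirb's common.txt and batch-insert the hits.
import gevent
from gevent import monkey, pool

monkey.patch_all()
import requests
import pymysql

WORDLIST = "/usr/share/dirb/wordlists/common.txt"  # ships with Kali

def load_dirs(limit=200):
    with open(WORDLIST) as f:
        return [line.strip() for line in f if line.strip() and not line.startswith("#")][:limit]

dir_results = []

def probe_dir(base, path):
    try:
        s = requests.get(base + "/" + path, timeout=3, allow_redirects=False)
        if s.status_code in (200, 401, 403):  # treat these as "path exists"
            dir_results.append(base + "/" + path + "$$" + str(s.status_code))
            print(dir_results[-1])
    except:
        pass

def scan_target(base, dirs):
    scan_pool = pool.Pool(50)
    gevent.joinall([scan_pool.spawn(probe_dir, base, d) for d in dirs])

if __name__ == '__main__':
    dirs = load_dirs()
    # the targets would normally come from the info_result table built above
    for base in ["http://www.example.com"]:
        scan_target(base, dirs)
    db = pymysql.connect(host='localhost', user='root', password='root', db='Target_info', port=3306)
    cursor = db.cursor()
    # small parameterized batches, as recommended in the summary above
    for i in range(0, len(dir_results), 2000):
        cursor.executemany("insert into dir_results (dir_result) values (%s)",
                           [(r,) for r in dir_results[i:i + 2000]])
        db.commit()
    db.close()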


Dynamically crawling URLs:

# coding=utf-8
import gevent
from gevent import monkey, pool
monkey.patch_all()

import requests
import re
from urllib.parse import urlparse
import sys
import os
import threadpool

import pymysql
import codecs

sys.path.append(os.path.abspath('../libs'))
from load_dict import load_broken_urls

class crawl(object):
    def __init__(self, url, max_num=5):
        self.url = url
        self.collect_url = []
        self.max_num = max_num  # maximum crawl depth
        # static-resource extensions to skip; stored without the leading dot so
        # they match the i.split(".")[-1] check below
        self.suffix = ["png", "jpg", "gif", "css", "js", "ico", "javascript",
                       "PNG", "JPG", "GIF", "CSS", "JS", "ICO", "JAVASCRIPT"]

    def run(self):
        self.Is_url = self.deal_url(self.url)
        # the stored urls already carry a scheme, so take it from the url itself
        self.Is_http_s = self.url.split("://")[0] + "://"
        num = 0
        self.crawl(self.url, num)
        # self.file_save()

    def crawl(self, url, num):
        num = num + 1
        if num != self.max_num:
            try:
                s = requests.get(url=url)
                self.get_url(s.text, num)
            except:
                pass

    # get the root host, e.g. www.qq.com
    def deal_url(self, url):
        res = urlparse(url).netloc
        return res

    # save the collected urls to a per-target file
    def file_save(self):
        if not os.path.exists("target/" + self.Is_url):
            os.mkdir("target/" + self.Is_url)
        with open("target/" + self.Is_url + "/" + self.Is_url + ".txt", "w+") as f:
            for url in self.collect_url:
                f.write(url)
                f.write("\n")

    # strip the href=/src= prefix plus surrounding quotes and slashes
    def deal_href_link_src_import(self, url, string):
        if url.startswith(string):  # str.strip(string) would eat characters, not the prefix
            url = url[len(string):]
        url = url.strip("\"")
        url = url.strip("'")
        url = url.strip("/")
        return url

    # extract the urls from a page, normalize them, and crawl them recursively
    def get_url(self, content, num):
        urls1 = re.findall('href=".*?"', content)
        urls2 = re.findall("href=\'.*?\'", content)
        urls3 = re.findall('src=".*?"', content)
        urls4 = re.findall("src=\'.*?\'", content)
        urls = urls1 + urls2 + urls3 + urls4
        for i in urls:
            if self.Is_url in i:  # stay on the target host
                if "href=" in i:
                    i = self.deal_href_link_src_import(i, "href=")
                if "src=" in i:
                    i = self.deal_href_link_src_import(i, "src=")
                if i.split(".")[-1] not in self.suffix:  # skip static resources
                    if i not in self.collect_url:
                        if "http" not in i:
                            i = self.Is_http_s + i
                        print(i)
                        self.collect_url.append(i)
                        info_results.append(i)
                        self.crawl(i, num)

    # decide whether the site speaks http or https
    def Is_http_or_https(self, url):
        if "http" in url:
            return url.split("://")[0] + "://"
        try:
            requests.get("https://" + self.Is_url, timeout=3)
            return "https://"
        except:
            return "http://"

    # TODO: plenty of .html links and links under the same directory could be
    # deduplicated further


# example usage:
# crawl("http://www.naivete.online").run()
def first_setup_crawl(i):
    global info_results
    global thread_nums
    global Insert_subdomains_Sql
    # flush to MySQL in batches of 2000 once more than 10000 results have accumulated
    if len(info_results) > 10000:
        print(len(info_results))
        info_results2 = info_results
        info_results = list()
        nums = len(info_results2) // 2000
        print(nums)
        db = pymysql.connect(host='localhost', user='root', password='root', db='Target_info', port=3306)
        cursor = db.cursor()
        for j in range(nums):  # j, not i: the batch loop must not clobber the thread index
            if j == (nums - 1):
                temp_info_results = info_results2[2000 * j:]
            else:
                temp_info_results = info_results2[2000 * j:2000 * (j + 1)]
            for temp_info_result in temp_info_results:
                Insert_subdomains_Sql = Insert_subdomains_Sql + "(\'" + temp_info_result + "\'),"
            Insert_subdomains_Sql = Insert_subdomains_Sql.rstrip(",")
            try:
                if cursor.execute(Insert_subdomains_Sql):
                    print("Successful")
                    db.commit()
            except:
                print("Failed")
                db.rollback()
                # fall back to dumping this batch to a file
                with codecs.open("../store/deal/info_result.txt", "a+") as f:
                    for temp_info_result in temp_info_results:
                        f.write(temp_info_result)
                        f.write("\n")
            # reset the statement prefix for the next batch
            Insert_subdomains_Sql = " insert into crawl_url (crawl_url) values "
        db.close()
    # load this thread's slice of urls (2000 per thread; the last thread takes the rest)
    if i == (thread_nums - 1):
        temp_domain_results = urls_list[2000 * i:]
    else:
        temp_domain_results = urls_list[2000 * i:2000 * (i + 1)]
    scan_pool = gevent.pool.Pool(50)
    # spawn the bound run method; crawl(url).run() would run immediately and
    # hand gevent its None return value instead of a callable
    gevent_list = [scan_pool.spawn(crawl(url).run) for url in temp_domain_results]
    gevent.joinall(gevent_list)

if __name__ == '__main__':
    info_results = list()
    urls_list = load_broken_urls()

    Insert_subdomains_Sql = " insert into crawl_url (crawl_url) values "
    print(len(urls_list))
    # one worker thread per 2000 urls
    thread_nums = len(urls_list) // 2000 + 1
    b = range(thread_nums)
    print("Split into: " + str(thread_nums))

    thread_pool = threadpool.ThreadPool(50)
    reqs = threadpool.makeRequests(first_setup_crawl, b)
    [thread_pool.putRequest(req) for req in reqs]  # dispatch all requests to the pool
    thread_pool.wait()  # block until all threads finish

    # insert the remaining crawl results into the database
    db = pymysql.connect(host='localhost', user='root', password='root', db='Target_info', port=3306)
    cursor = db.cursor()
    for info_result in info_results:
        Insert_subdomains_Sql = Insert_subdomains_Sql + "(\'" + info_result + "\'),"
    Insert_subdomains_Sql = Insert_subdomains_Sql.rstrip(",")
    try:
        if cursor.execute(Insert_subdomains_Sql):
            print("Successful")
            db.commit()
            info_results = list()
            Insert_subdomains_Sql = " insert into crawl_url (crawl_url) values "
    except:
        print("Failed")
        db.rollback()
        # fall back to dumping the remaining results to a file
        with codecs.open("../store/deal/info_result.txt", "a+") as f:
            for info_result in info_results:
                f.write(info_result)
                f.write("\n")
        info_results = list()
        Insert_subdomains_Sql = " insert into crawl_url (crawl_url) values "
    # close the database connection
    db.close()

2020.1.19
