运用selenium爬取google和钟馗之眼

  • by

正文:

本文难度系数无,把以前学的代码炒回锅肉
1.不清楚钟馗之眼是怎样限制博主的代码的:脚本只能取到20行数据,但正常登录浏览就没有问题,令人疑惑
2.google的验证码是真的恶心
钟馗之眼

#coding=utf-8
# Header of the ZoomEye scraper: default-encoding setup plus Selenium imports.
# The builtin reload() and sys.setdefaultencoding() used below exist only on
# Python 2.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import time
import sys
reload( sys )
# NOTE: this second call overrides the 'utf-8' default set a few lines above,
# so the process-wide default encoding ends up being 'gbk'.
sys.setdefaultencoding('gbk')
from selenium import webdriver
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
# ActionChains is Selenium's mouse-action helper class.
from selenium.webdriver.common.action_chains import ActionChains
# ZoomEye scraper: open the SSO login page, run the search "app:phpmyadmin"
# on zoomeye.org, walk the result pages and append every collected link to
# urlss.txt (one URL per line).
#
# Whatever has been collected is written out and the browser is closed even
# when a wait times out or the run is interrupted.
start_url = "https://sso.telnet404.com/cas/login/?next=/"
driver = webdriver.Chrome(
    executable_path="D:/selenium/chrome/chromedriver.exe",
    service_args=['--ignore-ssl-errors=true', '--ssl-protocol=TLSv1'])
wait = ui.WebDriverWait(driver, 20)
urls = []

# Common CSS prefix of the result list; both the result links and the
# pagination links live underneath it.
result_list_css = '#appZoomEye > div > div > div.content-width.main-content > div > div.search-result-left > div.search-result-list'

try:
    # Login page. Both fields receive an empty string and nothing submits the
    # form here.
    # NOTE(review): the 10 s pause presumably leaves time to log in by hand
    # -- confirm.
    driver.get(start_url)
    wait.until(lambda x: x.find_element_by_xpath('//*[@id="login_form"]/div[2]/input')).send_keys("")
    wait.until(lambda x: x.find_element_by_xpath('//*[@id="inputPassword"]')).send_keys("")
    time.sleep(10)

    # Type the query on the home page and click the search image.
    driver.get("https://www.zoomeye.org/")
    wait.until(lambda x: x.find_element_by_css_selector('#appZoomEye > div > div > div.home-content > div.home-search-body > form > div > div > div > ul > li > div > input')).send_keys("app:phpmyadmin")
    wait.until(lambda x: x.find_element_by_css_selector('#appZoomEye > div > div > div.home-content > div.home-search-body > form > img')).click()

    # i is the number of the page to open AFTER the current one was scraped.
    for i in range(2, 21):
        print("start:")
        # The result entries are read from nth-child(4) .. nth-child(23).
        for j in range(4, 24):
            print(j)
            link_css = result_list_css + ' > div:nth-child(' + str(j) + ') > div.search-result-item-info > a'
            # Wait until the link exists and has a non-empty href. A timeout
            # here propagates, but the finally block still saves the URLs.
            url = wait.until(lambda x: x.find_element_by_css_selector(link_css).get_attribute("href"))
            # NOTE(review): prefixing "http:" assumes the href comes back
            # protocol-relative ("//host/...") -- confirm.
            url = "http:" + url
            print(url)
            urls.append(url)

        page_css = result_list_css + " > div.search-result-pagination.clearfix > ul > li.ant-pagination-item.ant-pagination-item-" + str(i) + "> a"
        try:
            ActionChains(driver).click(wait.until(lambda x: x.find_element_by_css_selector(page_css))).perform()
        except Exception as e:
            # Best effort, as before: keep going, but report the failure
            # instead of hiding it.
            print("could not open result page %d: %s" % (i, e))
finally:
    try:
        # The with-statement closes the file; no explicit close() is needed.
        with open("urlss.txt", "a+") as f:
            for url in urls:
                f.write(str(url))
                f.write('\n')
    finally:
        # Always shut the browser down, even if writing the file failed.
        driver.quit()

2.
google

#coding=utf-8
# Header of the Google scraper: default-encoding setup plus Selenium imports.
# The builtin reload() and sys.setdefaultencoding() used below exist only on
# Python 2.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import time,random
from selenium import webdriver
import selenium.webdriver.support.ui as ui
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException
# ActionChains is Selenium's mouse-action helper class.
from selenium.webdriver.common.action_chains import ActionChains
# Google scraper: open a search-result URL, read the result links of each
# page, click on to the next page, and finally append every collected link to
# urlss.txt (one URL per line).
#
# Whatever has been collected is written out and the browser is closed even
# when the run is interrupted.

# Alternative start URLs kept by the original author:
#start_url="https://www.google.com/search?q=inurl:login.do&ei=kf_sW9TEDOGs0PEP8LOW4A8&start=50&sa=N&ved=0ahUKEwiU0ZX10NXeAhVhFjQIHfCZBfw4PBDy0wMIhQE&biw=1034&bih=277"
#start_url="https://www.google.com/search?q=inurl:login.do&ei=YxXtW4qkLY-t0PEPvPmluAI&start=240&sa=N&ved=0ahUKEwjK4YTd5dXeAhWPFjQIHbx8CSc45gEQ8tMDCIMB&biw=1034&bih=277"
start_url = "https://www.google.com/search?q=inurl:.action%3F&ei=5SvtW6nnB-jB0PEPlPO92Ac&start=50&sa=N&ved=0ahUKEwjp7aOY-9XeAhXoIDQIHZR5D3sQ8tMDCJsB&cshid=1542270136699000&biw=1366&bih=626"
urls = []
# Original author's scratch note, meaning not stated: 6-25 8  25 9
# NOTE(review): possibly records which td:nth-child index held the "next"
# link on which pages -- confirm.
s = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
driver = webdriver.Chrome(executable_path="D:/selenium/chrome/chromedriver.exe")
wait = ui.WebDriverWait(driver, 20)

try:
    driver.get(start_url)
    for n in range(1, 40):
        for j in s:
            # The original notes label '//*[@id="b_results"]/li[N]/h2/a' as
            # the "international version" and the #rso path used below as the
            # "domestic version".
            result_xpath = '//*[@id="rso"]/div/div/div[' + str(j) + ']/div/div/div[1]/a[1]'
            try:
                # Look the link up once and reuse the value for both the
                # console output and the result list.
                href = wait.until(lambda x: x.find_element_by_xpath(result_xpath).get_attribute("href"))
            except Exception as e:
                # Best effort, as before: a missing slot is skipped, but the
                # reason is now reported instead of being discarded.
                print("page %d, result %d skipped: %s" % (n, j, e))
                continue
            print(href)
            urls.append(href)

        # Random 3-5 s pause between pages.
        time.sleep(random.randint(3, 5))

        # Click the pager cell that leads to the next page. The original
        # notes also mention 'td:nth-child(8)' as an alternative index.
        try:
            ActionChains(driver).click(wait.until(lambda x: x.find_element_by_css_selector('#nav > tbody > tr > td:nth-child(9) > a'))).perform()
        except Exception as e:
            # Keep looping as the original did, but make the failure visible.
            print("page %d: could not click the next-page link: %s" % (n, e))
finally:
    try:
        # The with-statement closes the file; no explicit close() is needed.
        with open("urlss.txt", "a+") as f:
            for url in urls:
                f.write(str(url))
                f.write('\n')
    finally:
        # Always shut the browser down, even if writing the file failed.
        driver.quit()

 
 
2018.11.15
 

标签:

发表评论

电子邮件地址不会被公开。 必填项已用*标注