- selenium实现百度登陆与指数提取
- -- coding: utf-8 --import datetimeimport osimport reimport timeimport urllibfrom time import sleepimport requestsfrom selenium import webdriverfrom selenium.webdriver.common.by import Byfrom selenium.webdriver.support import expectedconditions as ECfrom selenium.webdriver.support.ui import WebDriverWaitfrom utils.regocr import getverifyclass BDCrawler(object): def __init(self, userName, pwd, chromedriver, data_dir): ''' 初始化 driver :param userName:账户 :param pwd: 密码 :param chromedriver:webdirver ''' os.environ["webdriver.chrome.driver"] = chromedriver options = webdriver.ChromeOptions() options.add_argument('—user-data-dir=' + data_dir) # 设置成用户自己的数据目录 # option.add_argument('—user-agent=iphone') #修改浏览器的User-Agent来伪装你的浏览器访问手机m站 # option.add_extension('d:\crx\AdBlock_v2.17.crx') # 自己下载的crx路径 self.driver = webdriver.Chrome(executable_path=chromedriver, chrome_options=options) # self.driver = webdriver.Chrome(chromedriver) self.userName = str(userName) self.pwd = pwd def login(self): ''' 判断是否需要登录 :return: ''' try: if self.driver.find_elements_by_class_name('compInfo'): return True WebDriverWait(self.driver, 30).until( EC.presence_of_element_located((By.NAME, 'userName')) ) userName = self.driver.find_element_by_name("userName") userName.send_keys(self.userName) password = self.driver.find_element_by_name("password") password.send_keys(self.pwd) submit = self.driver.find_element_by_xpath('//[@class="pass-form-item pass-form-item-submit"]') submit.submit() print '请检查是否有验证码,手动输入' sleep(10) while self.driver.find_element_by_xpath('//[@class="pass-verifyCode"]'): try: if self.driver.find_element_by_xpath( '//[@class="pass-success pass-success-verifyCode" and @style="display: block; visibility: visible; opacity: 1;"]'): submit = self.driver.find_element_by_xpath('//[@class="pass-form-item pass-form-item-submit"]') submit.submit() break except: pass sleep(6) return True except: return False pass else: return False def getTimeSpan(self): try: self.driver.implicitly_wait(20) # seconds # 获取时间范围 time_span = self.driver.find_elements_by_class_name('compInfo')[4].text from_day = time_span.split()[0] to_day = time_span.split()[-1] from_daytime = datetime.datetime.strptime(from_day, "%Y-%m-%d").date() to_daytime = datetime.datetime.strptime(to_day, "%Y-%m-%d").date() alldays = (to_daytime - from_daytime).days self.driver.implicitly_wait(5) # seconds return from_daytime, alldays except: return -1 def downloadImageFile(self, keyword, imgUrl): local_filename = '../photos/' + keyword + time.strftime('%Y%m%d') + '.png' print "Download Image File=", localfilename r = requests.get(imgUrl, cookies=self.getCookieJson(), stream=True) # here we need to set stream = True parameter with open(localfilename, 'wb') as f: for chunk in r.itercontent(chunksize=1024): if chunk: # filter out keep-alive new chunks f.write(chunk) f.flush() f.close() return local_filename def getCookieJson(self): cookie_jar = {} for cookie in self.driver.get_cookies(): name = cookie['name'] value = cookie['value'] cookie_jar[name] = value return cookie_jar def webcrawler(self, keyword): ''' 百度指数控制函数 :return: ''' self.driver.get(' http://index.baidu.com/?tpl=trend&word=' ; + urllib.quote(keyword.encode('gb2312'))) # self.driver.maximize_window() self.login() try: # 判断form表单ajax加载完成标记:id属性 WebDriverWait(self.driver, 20).until( EC.presence_of_element_located((By.ID, 'trend')) ) except: pass try: for i in range(1, 4): if self.driver.find_element_by_class_name('toLoading').get_attribute('style') != u'display: none;': break sleep(5) except: pass for i in range(1, 4): from_daytime, alldays = self.getTimeSpan() if alldays < 0: self.driver.refresh() else: break sleep(2) for i in range(1, 4): try: # 获取所有的纵坐标的点 svg_data = re.search(r'<path fill="none" stroke="#3ec7f5"(.?)" stroke-width="2" stroke-opacity="1"', self.driver.page_source).group(1) break except: self.driver.refresh() sleep(2) pass points = [] for line in re.split('C', svg_data): tmp_point = re.split(',', line)[-1] points.append(tmp_point) # 判断时间天数差 与 points个数 是否一致 if len(points) != alldays + 1: return # self.driver.save_screenshot('aa.png') # 截取当前网页 CaptchaUrl = self.driver.find_element_by_id('trendYimg').get_attribute('src') # 定位坐标尺度 pic_path = self.downloadImageFile(urllib.quote(keyword.encode('gb2312')), CaptchaUrl) reg_txt = getverify(pic_path) MaxValue = float(reg_txt.split()[0].replace(',', '')) MinValue = float(reg_txt.split()[-1].replace(',', '')) kedu = (MaxValue - MinValue) / (len(reg_txt.split()) - 1) indexValue = [] for index, point in enumerate(points): day = from_daytime + datetime.timedelta(days=index) Xvalue = MaxValue - (float(point) - 130) (MaxValue - (MinValue - kedu)) / 207.6666666 indexValue.append({'day': day, 'value': Xvalue}) return indexValuedef test_ocr(): pic_path = r'..\photos\love_2016_08_31.png' reg_txt = getverify(pic_path)if __name == '__main': test_ocr()
- -- coding: utf-8 --from PIL import Imagefrom PIL import ImageEnhancefrom pytesseract import def getverify(name): ''' 图片识别模块 :param name:图片path :return: ''' # 打开图片 im = Image.open(name) # 使用ImageEnhance可以增强图片的识别率 enhancer = ImageEnhance.Contrast(im) image_enhancer = enhancer.enhance(4) # 放大图像 方便识别 im_orig = image_enhancer.resize((image_enhancer.size[0] 2, imageenhancer.size[1] * 2), Image.BILINEAR) # 识别 text = imagetostring(imorig) im.close() im_orig.close() # 识别对吗 text = text.strip() return text# 验证码识别,此程序只能识别数据验证码if __name == '__main': getverify('trendYimg.png') # 注意这里的图片要和此文件在同一个目录,要不就传绝对路径也行
- -- coding: utf-8 --import base64import yamlfrom BDSpider.bdspider import BDCrawlerdef main(): stream = file('../config/setting.yaml', 'r') # 'document.yaml' contains a single YAML document. pyconfig = yaml.load(stream) bdcrawler = BD_Crawler(pyconfig['Authentication']['name'], base64.b64decode(pyconfig['Authentication']['pwd']), pyconfig['chromedriver'], pyconfig['data_dir']) values = bd_crawler.webcrawler(u'爱情') print values print 'end'if __name == '__main': main()
selenium实现百度登陆与指数提取
需求
http://index.baidu.com/?tpl=trend&word=itools
三次贝塞尔曲线分析
M20,209.71043771043767C20,209.71043771043767,37.403792291215744,202.91564252551038,54.793103448275865,203.41750841750837C72.18241460533599,203.91937430950637,73.20310901938423,205.95708808566553,89.58620689655173,211.80808080808077C105.96930477371923,217.65907353049602,107.03607008639847,227.22833278337725,124.37931034482759,228.58922558922558C141.7225506032567,229.9501183950739,142.51636522414827,213.07945078549736,159.17241379310346,218.10101010101005C175.82846236205864,223.12256941652274,178.44993667348015,252.18578642065503,193.96551724137933,260.05387205387206C209.4810978092785,267.92195768708905,211.42970775670906,263.68409090917896,228.75862068965517,262.1515151515151C246.0875336226013,260.6189393938513,246.2535151946967,251.9137883963013,263.55172413793105,253.76094276094273C280.8499330811654,255.60809712558415,280.953233267576,270.9573693100153,298.3448275862069,270.54208754208753C315.7364219048378,270.1268057741598,318.03951864381804,260.3049404318127,333.1379310344828,251.66329966329965C348.23634342514754,243.02165889478658,352.22734445560417,238.1724636145718,367.93103448275866,230.68686868686865C383.63472450991316,223.2012737591655,385.3765270266145,216.79701324539502,402.72413793103453,218.10101010101005C420.0717488354545,219.40500695662507,420.2680561610064,234.72023581086987,437.51724137931035,236.97979797979792C454.7664265976143,239.239360148726,459.27137746988086,240.10553324989965,472.3103448275862,228.58922558922558C485.3493121852916,217.0729179285515,489.7547176782527,135.00391473049126,507.1034482758621,136.29292929292924C524.4521788734716,137.58194385536723,529.3926083978823,285.7163490857493,541.896551724138,297.8114478114478C554.4004950503937,309.90654653714626,559.2931034482759,304.1043771043771,576.6896551724138,304.1043771043771C594.0862068965517,304.1043771043771,594.0934474636296,297.30958191944984,611.4827586206897,297.8114478114478C628.8720697777499,298.3133137034458,630.6164737870624,298.6241860439508,646.2758620689656,306.20202020202015C661.9352503508686,313.77985436008953,664.461123342158,328.29261708828153,681.0689655172414,333.4713804713804C697.6768076923248,337.66666666666663,698.9227646961804,335.33605774965736,715.8620689655173,331.37373737373736C732.8013732348542,327.41141699781735,733.5081181472079,319.62597101028564,750.6551724137931,316.6902356902357C767.8022266803783,313.7545003701857,768.1759444027329,316.7126491983071,785.4482758620691,318.78787878787875C802.7206073214052,320.86310837745043,802.8520681532848,325.582673972806,820.2413793103449,325.080808080808C837.630690467405,324.57894218881006,837.6451716015606,317.1921015822337,855.0344827586207,316.6902356902357C872.4237939156809,316.18836979823766,873.1531736773215,318.02292520225126,889.8275862068966,322.98316498316495C906.5019987364716,327.94340476407865,907.4736353885872,334.73093134661667,924.6206896551724,337.66666666666663C941.7677439217576,337.66666666666663,942.578941522853,337.66666666666663,959.4137931034484,335.56902356902356C976.2486446840436,331.1840738372781,977.1386719183727,322.1517634107601,994.2068965517242,318.78787878787875C1011.2751211850757,315.4239941649974,1029,320.8855218855218,1029,320.8855218855218
rect矩形信息
<rect x="20" y="130" width="1214" height="207.66666666666666" r="0" rx="0" ry="0" fill="#ff0000" stroke="none" opacity="0" style="-webkit-tap-highlight-color: rgba(0, 0, 0, 0); opacity: 0;"></rect>
右侧的标尺
<img id="trendYimg" style="position: absolute; left: 1129px; top: 137px;" src="/Interface/IndexShow/getYaxis/?res=LWJGOX8PdwAFFgE0KQAdVw0vF3V%2FM3YeDksXYzRaAAgMIxskLFd1H3ECJT5TRxQSCVcwORcsBXwhJwsqE0ZkJn4DNzxaBhl8QzVBKzZtIXAKMn4gdmt7BRoZCTVPIDsWMz9IMAVbfxkhEAwiOFZ4FAkQUi9CBwEhVnAlBBofDhsuMF4ZeVRlIjBCOkA4YC0MAWI8DC02MlRhEjIUPlhCNAAjEy4SAkYROmIWHQFrFXYqCRxORSggK0onYRsiDRQWJCgCU380LDcU&res2=bPEXSTR7htcHoxnlmzMmLgSCjXbEFWycT6i3M5VHoHwRcxsgjpzZGvy544.271bP&max_y=6bGp&min_y=6bo&axis=Pb5">
分析
计算
源码分析
类webdriver chrome操作
-- coding: utf-8 --import datetimeimport osimport reimport timeimport urllibfrom time import sleepimport requestsfrom selenium import webdriverfrom selenium.webdriver.common.by import Byfrom selenium.webdriver.support import expectedconditions as ECfrom selenium.webdriver.support.ui import WebDriverWaitfrom utils.regocr import getverifyclass BDCrawler(object): def __init(self, userName, pwd, chromedriver, data_dir): ''' 初始化 driver :param userName:账户 :param pwd: 密码 :param chromedriver:webdirver ''' os.environ["webdriver.chrome.driver"] = chromedriver options = webdriver.ChromeOptions() options.add_argument('—user-data-dir=' + data_dir) # 设置成用户自己的数据目录 # option.add_argument('—user-agent=iphone') #修改浏览器的User-Agent来伪装你的浏览器访问手机m站 # option.add_extension('d:\crx\AdBlock_v2.17.crx') # 自己下载的crx路径 self.driver = webdriver.Chrome(executable_path=chromedriver, chrome_options=options) # self.driver = webdriver.Chrome(chromedriver) self.userName = str(userName) self.pwd = pwd def login(self): ''' 判断是否需要登录 :return: ''' try: if self.driver.find_elements_by_class_name('compInfo'): return True WebDriverWait(self.driver, 30).until( EC.presence_of_element_located((By.NAME, 'userName')) ) userName = self.driver.find_element_by_name("userName") userName.send_keys(self.userName) password = self.driver.find_element_by_name("password") password.send_keys(self.pwd) submit = self.driver.find_element_by_xpath('//[@class="pass-form-item pass-form-item-submit"]') submit.submit() print '请检查是否有验证码,手动输入' sleep(10) while self.driver.find_element_by_xpath('//[@class="pass-verifyCode"]'): try: if self.driver.find_element_by_xpath( '//[@class="pass-success pass-success-verifyCode" and @style="display: block; visibility: visible; opacity: 1;"]'): submit = self.driver.find_element_by_xpath('//[@class="pass-form-item pass-form-item-submit"]') submit.submit() break except: pass sleep(6) return True except: return False pass else: return False def getTimeSpan(self): try: self.driver.implicitly_wait(20) # seconds # 获取时间范围 time_span = self.driver.find_elements_by_class_name('compInfo')[4].text from_day = time_span.split()[0] to_day = time_span.split()[-1] from_daytime = datetime.datetime.strptime(from_day, "%Y-%m-%d").date() to_daytime = datetime.datetime.strptime(to_day, "%Y-%m-%d").date() alldays = (to_daytime - from_daytime).days self.driver.implicitly_wait(5) # seconds return from_daytime, alldays except: return -1 def downloadImageFile(self, keyword, imgUrl): local_filename = '../photos/' + keyword + time.strftime('%Y%m%d') + '.png' print "Download Image File=", localfilename r = requests.get(imgUrl, cookies=self.getCookieJson(), stream=True) # here we need to set stream = True parameter with open(localfilename, 'wb') as f: for chunk in r.itercontent(chunksize=1024): if chunk: # filter out keep-alive new chunks f.write(chunk) f.flush() f.close() return local_filename def getCookieJson(self): cookie_jar = {} for cookie in self.driver.get_cookies(): name = cookie['name'] value = cookie['value'] cookie_jar[name] = value return cookie_jar def webcrawler(self, keyword): ''' 百度指数控制函数 :return: ''' self.driver.get('http://index.baidu.com/?tpl=trend&word=' + urllib.quote(keyword.encode('gb2312'))) # self.driver.maximize_window() self.login() try: # 判断form表单ajax加载完成标记:id属性 WebDriverWait(self.driver, 20).until( EC.presence_of_element_located((By.ID, 'trend')) ) except: pass try: for i in range(1, 4): if self.driver.find_element_by_class_name('toLoading').get_attribute('style') != u'display: none;': break sleep(5) except: pass for i in range(1, 4): from_daytime, alldays = self.getTimeSpan() if alldays < 0: self.driver.refresh() else: break sleep(2) for i in range(1, 4): try: # 获取所有的纵坐标的点 svg_data = re.search(r'<path fill="none" stroke="#3ec7f5"(.?)" stroke-width="2" stroke-opacity="1"', self.driver.page_source).group(1) break except: self.driver.refresh() sleep(2) pass points = [] for line in re.split('C', svg_data): tmp_point = re.split(',', line)[-1] points.append(tmp_point) # 判断时间天数差 与 points个数 是否一致 if len(points) != alldays + 1: return # self.driver.save_screenshot('aa.png') # 截取当前网页 CaptchaUrl = self.driver.find_element_by_id('trendYimg').get_attribute('src') # 定位坐标尺度 pic_path = self.downloadImageFile(urllib.quote(keyword.encode('gb2312')), CaptchaUrl) reg_txt = getverify(pic_path) MaxValue = float(reg_txt.split()[0].replace(',', '')) MinValue = float(reg_txt.split()[-1].replace(',', '')) kedu = (MaxValue - MinValue) / (len(reg_txt.split()) - 1) indexValue = [] for index, point in enumerate(points): day = from_daytime + datetime.timedelta(days=index) Xvalue = MaxValue - (float(point) - 130) (MaxValue - (MinValue - kedu)) / 207.6666666 indexValue.append({'day': day, 'value': Xvalue}) return indexValuedef test_ocr(): pic_path = r'..\photos\love_2016_08_31.png' reg_txt = getverify(pic_path)if __name == '__main': test_ocr()
验证码识别:定义reg_ocr.py
-- coding: utf-8 --from PIL import Imagefrom PIL import ImageEnhancefrom pytesseract import def getverify(name): ''' 图片识别模块 :param name:图片path :return: ''' # 打开图片 im = Image.open(name) # 使用ImageEnhance可以增强图片的识别率 enhancer = ImageEnhance.Contrast(im) image_enhancer = enhancer.enhance(4) # 放大图像 方便识别 im_orig = image_enhancer.resize((image_enhancer.size[0] 2, imageenhancer.size[1] * 2), Image.BILINEAR) # 识别 text = imagetostring(imorig) im.close() im_orig.close() # 识别对吗 text = text.strip() return text# 验证码识别,此程序只能识别数据验证码if __name == '__main': getverify('trendYimg.png') # 注意这里的图片要和此文件在同一个目录,要不就传绝对路径也行
主函数
-- coding: utf-8 --import base64import yamlfrom BDSpider.bdspider import BDCrawlerdef main(): stream = file('../config/setting.yaml', 'r') # 'document.yaml' contains a single YAML document. pyconfig = yaml.load(stream) bdcrawler = BD_Crawler(pyconfig['Authentication']['name'], base64.b64decode(pyconfig['Authentication']['pwd']), pyconfig['chromedriver'], pyconfig['data_dir']) values = bd_crawler.webcrawler(u'爱情') print values print 'end'if __name == '__main': main()
定义配置文件 setting.yaml
Authentication:
name: piaosanlang@qq.com
pwd: XXXXXXXXXXX
chromedriver: C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe
data_dir: C:\Users\enlong\AppData\Local\Google\Chrome\User Data11