"""Minimal crawler demo: fetch a page with retry/proxy support and extract
<a> tag texts.

NOTE(review): the free proxy IPs hard-coded below are almost certainly stale;
replace them with live ones from a free-proxy site before relying on them.
"""
import re
import time
import urllib.robotparser

import chardet
import requests
from fake_useragent import UserAgent


def get_headers():
    """Return request headers carrying a randomized User-Agent string."""
    ua = UserAgent()
    return {'User-Agent': ua.random}


def get_proxies():
    """Return a requests-style proxies mapping.

    Bug fix: the original literal repeated the key "http" five times, so
    Python silently kept only the last entry — the first four proxies were
    dead code. A candidate pool is kept here for the maintainer; one entry
    is returned, matching the original effective behavior (a one-key dict).
    """
    # Candidate pool from the original (only the last one ever took effect).
    candidates = [
        "111.43.70.58",
        "117.90.1.102",
        "111.177.186.120",
        "58.210.94.242",
        "122.234.206.52",
    ]
    return {"http": candidates[-1]}


def robot_check(robotstxt_url, headers, url):
    """Return True if robots.txt at `robotstxt_url` permits this
    User-Agent (taken from `headers`) to fetch `url`."""
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(robotstxt_url)
    rp.read()
    return rp.can_fetch(headers['User-Agent'], url)


def get_data(url, num_retries=3, proxies=None, headers=None):
    """GET `url`, retrying up to `num_retries` times on a 5xx response.

    Parameters:
        url: target URL.
        num_retries: remaining retry budget for server (5xx) errors.
        proxies: optional requests proxies mapping.
        headers: optional headers dict; a random one is generated when None
            (backward-compatible — the original read a global `headers`).

    Returns the requests.Response, or None when the request failed.
    """
    if headers is None:
        headers = get_headers()
    try:
        # Bug fix: the original accepted `proxies` but never passed it on.
        data = requests.get(url, headers=headers, proxies=proxies)
        print(data.status_code)
    except requests.exceptions.ConnectionError as e:
        print("请求错误,url:", url)
        print("错误详情", e)
        data = None
    except Exception:  # narrowed from bare `except:` (kept best-effort intent)
        print("未知错误,url:", url)
        data = None
    if data is not None and 500 <= data.status_code < 600:
        if num_retries > 0:
            print("服务器错误,正在重试.......")
            time.sleep(1)
            # Bug fix: the recursive retry's result was discarded, so the
            # caller always got the first (failed) response back.
            return get_data(url, num_retries - 1, proxies=proxies,
                            headers=headers)
    return data


def parse_data(data):
    """Detect the response encoding via chardet and return the inner text of
    every <a>...</a> pair, or None when `data` is None."""
    if data is None:
        return None
    charset = chardet.detect(data.content)
    data.encoding = charset['encoding']
    html_text = data.text
    # Bug fix: the original pattern's closing tag was garbled as "</a444>",
    # which could never match real markup.
    return re.findall(r'<a>(.*?)</a>', html_text)


if __name__ == '__main__':
    headers = get_headers()
    proxies = get_proxies()
    data = get_data("http://www.baidu.com", num_retries=3, proxies=proxies,
                    headers=headers)
    t = parse_data(data)
    print(t)