python爬虫1

python爬虫(1)

(注:代码使用的IP代理不稳定,运行时极有可能失效,建议去免费IP代理网站上寻找代理IP,替换代码中的IP)

(原文代码清单的行号栏 1–68 为提取残留,已移除;完整代码见下文)
import re
import time
import urllib.robotparser

import chardet
import requests
from fake_useragent import UserAgent

def get_headers():
    """Build request headers carrying a randomized User-Agent string."""
    random_agent = UserAgent().random
    return {'User-Agent': random_agent}


def get_proxies():
    """Return a proxies mapping for ``requests.get(..., proxies=...)``.

    BUG FIX: the original dict literal repeated the ``"http"`` key five
    times, so Python silently kept only the last IP and discarded the
    other four.  The candidates are now held in a list, and the mapping
    exposes one of them under a single ``"http"`` key (the last one, to
    preserve the original effective behavior).

    NOTE(review): entries are bare IPs with no scheme or port; these free
    proxies are unstable (see the note at the top of the article) --
    replace them with working "http://ip:port" values before relying on
    this.
    """
    candidate_ips = [
        "111.43.70.58",
        "117.90.1.102",
        "111.177.186.120",
        "58.210.94.242",
        "122.234.206.52",
    ]
    # One "http" entry, matching what the original duplicate-key dict
    # actually evaluated to.
    return {"http": candidate_ips[-1]}

#robot检测
def robot_check(robotstxt_url,headers,url):
rp=urllib.robotparser.RobotFileParser()
rp.set_url(robotstxt_url)
rp.read()
result=rp.can_fetch(headers['User-Agent'],url)
return result

def get_data(url, num_retries=3, proxies=None, headers=None):
    """Fetch *url* with requests, retrying up to *num_retries* times on 5xx.

    Fixes vs. the original:
      * ``proxies`` was accepted but never forwarded to ``requests.get`` --
        now passed through.
      * the recursive retry's return value was discarded and the stale 5xx
        response returned instead -- the retry result is now returned.
      * bare ``except:`` replaced with ``requests.exceptions.RequestException``.
      * a timeout is set so a dead proxy cannot hang the request forever.
      * ``headers`` is now an explicit keyword argument; when omitted it
        falls back to the module-global ``headers`` the original read
        implicitly, so existing calls keep working.

    Returns the ``requests.Response``, or ``None`` when the request failed.
    """
    if headers is None:
        # Backward compatibility: the original read a module-global
        # ``headers`` set by the __main__ block.
        headers = globals().get("headers")
    try:
        data = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        print(data.status_code)
    except requests.exceptions.ConnectionError as e:
        print("请求错误,url:", url)
        print("错误详情", e)
        data = None
    except requests.exceptions.RequestException:
        print("未知错误,url:", url)
        data = None

    # Retry on server-side (5xx) errors only.
    if (data is not None) and (500 <= data.status_code < 600):
        if num_retries > 0:
            print("服务器错误,正在重试.......")
            time.sleep(1)
            return get_data(url, num_retries - 1, proxies=proxies, headers=headers)
    return data

def parse_data(data):
    """Extract anchor texts from a ``requests`` response.

    Fixes vs. the original:
      * the regex closed with ``</a444>`` -- a garbled tag that can never
        appear in real HTML -- and only matched attribute-free ``<a>``
        tags; it now matches ``<a ...>text</a>``.
      * charset detection uses the response's own ``apparent_encoding``
        (requests' built-in detection) instead of calling chardet on the
        raw bytes, which produces the same effect with one fewer
        third-party call.

    Returns a list of anchor texts, or ``None`` when *data* is ``None``.
    """
    if data is None:
        return None
    # Set the detected charset so data.text decodes correctly.
    data.encoding = data.apparent_encoding
    return re.findall(r'<a[^>]*>(.*?)</a>', data.text)

if __name__ == '__main__':
    # Entry point: fetch Baidu's homepage through a proxy and print the
    # anchor texts the parser extracts.
    headers = get_headers()  # kept as a module global: get_data() reads it
    proxies = get_proxies()
    data = get_data("http://www.baidu.com", num_retries=3, proxies=proxies)
    print(parse_data(data))