<< 《搜索引擎评分指南》阅读心得 | 首页 | Fix certificate problem in HTTPS - Real's Java How-to >>

爬取百度网盘用户分享 | Guodong

上面3个链接的请求必须带上请求头 ("Referer", "https://yun.baidu.com/share/home?uk=23432432#category/type=0"),uk的值是多少无所谓,否则请求不到json数据,
获取用户订阅和获取用户粉丝每次请求一次休眠2s的话可以无限制请求,对ip没要求,获取用户分享超坑,一个ip只能请求10次,并且休眠也没用.
因为没有那么多ip,我就去研究手机版的用户分享,手机版获取用户分享可以一次性连续请求60次,60次后必须休眠35s左右再继续请求就可以,不会像pc版那样必须换ip,
但是手机版只能请求网页源码,然后用正则进行匹配.

下面上源码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
// Logger used by the crawler methods of this class (FollowStartIndex).
private Logger log = LoggerFactory.getLogger(FollowStartIndex.class);
/**
 * Runs the crawl loop: repeatedly takes one not-yet-crawled uk from the {@code avaiuk}
 * table (rows with {@code flag=0}), marks it as crawled ({@code flag=1}) and then crawls
 * that user's follow list starting at offset 0.
 *
 * <p>The loop ends when the query returns no row. The table is expected to be seeded
 * beforehand, e.g. with the follower uks of a user who has many followers.
 */
public void startIndex() {
    while (true) {
        // Pick one uk that has not been used as a crawl seed yet.
        Avaiuk avaiuk = Avaiuk.dao.findFirst("select * from avaiuk where flag=0 limit 1");
        if (avaiuk == null) {
            // Nothing left to crawl; stop instead of failing with a NullPointerException.
            log.info("no uncrawled uk left in avaiuk, stopping");
            return;
        }
        // Mark the uk as taken before crawling so it is not picked again.
        avaiuk.set("flag", 1).update();
        // NOTE(review): the original call passed only (uk, 0), which does not match
        // getFllow(long, int, boolean). `true` is assumed here so the seed user's own
        // shares are indexed as well -- confirm the intended value.
        getFllow(avaiuk.getLong("uk"), 0, true);
    }
}

// URL template for the "follow list" endpoint; the two %s placeholders are filled with
// the queried uk and the paging offset (see String.format(url, uk, start) in recFollow).
static String url = "http://yun.baidu.com/pcloud/friend/getfollowlist?query_uk=%s&limit=24&start=%s&bdstoken=e6f1efec456b92778e70c55ba5d81c3d&channel=chunlei&clienttype=0&web=1&logid=MTQ3NDA3NDg5NzU4NDAuMzQxNDQyMDY2MjA5NDA4NjU=";

// HTTP request headers sent with every follow-list request (header name -> header value).
static Map<String, String> map = new HashMap<String, String>();

static {
    map.put("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36");
    map.put("X-Requested-With", "XMLHttpRequest");
    map.put("Accept", "application/json, text/javascript, */*; q=0.01");
    // A Referer header is included in every request; the uk inside it is a fixed value.
    map.put("Referer", "https://yun.baidu.com/share/home?uk=325913312#category/type=0");
    map.put("Accept-Language", "zh-CN");
}

//获取订阅用户
/**
 * Crawls the users that {@code uk} follows.
 *
 * <p>A uk that is not yet recorded in Redis is recorded, optionally has its shared
 * resources indexed, and then has its follow list fetched with paging enabled. A uk that
 * is already recorded is only processed when {@code start > 0}, i.e. when this call is a
 * follow-up page request for it.
 *
 * @param uk    user key whose follow list is requested
 * @param start offset into the follow list
 * @param index whether to index the user's shared resources when the uk is new
 */
public void getFllow(long uk, int start, boolean index) {
    log.info("进来getFllow,uk:{},start:{}", uk, start);

    boolean exitUK;
    try {
        exitUK = Redis.use().exists(uk);
    } catch (Exception e) {
        // Best effort: when the lookup fails the uk is treated as already seen, but the
        // failure is logged so a Redis outage does not go unnoticed.
        log.warn("redis exists check failed, treating uk as already seen, uk:{}", uk, e);
        exitUK = true;
    }

    if (!exitUK) {
        // First visit: remember the uk, then index and crawl it.
        Redis.use().set(uk, "");
        if (index) {
            indexResource(uk);
        }
        recFollow(uk, start, true);
        return;
    }

    if (start > 0) {
        // Follow-up page of a uk that is already recorded; do not trigger paging again.
        recFollow(uk, start, false);
    } else {
        log.warn("uk is index:{}", uk);
    }
}
/**
 * Fetches one page of the follow list of {@code uk} and recurses into the listed users.
 *
 * <p>Waits 4 seconds before each request. Every listed user with a follow count above
 * zero is passed to {@link #getFllow(long, int, boolean)}; its shares are indexed only
 * when its public share count is above zero. When {@code goPage} is set, the remaining
 * pages (24 entries per page) of this uk are requested as well.
 *
 * @param uk     user key whose follow list is requested
 * @param start  offset into the follow list
 * @param goPage whether to request the remaining pages after this one
 */
public void recFollow(long uk, int start, boolean goPage) {
    try {
        Thread.sleep(4000);
    } catch (InterruptedException e) {
        // Restore the interrupt flag and stop, so an interrupted crawler does not keep
        // sending requests without the throttling delay.
        Thread.currentThread().interrupt();
        log.warn("interrupted before requesting follow list, uk:{},start:{}", uk, start);
        return;
    }

    String real_url = String.format(url, uk, start);
    ResponseBody body = OkhttpUtil.syncGet(real_url, Headers.of(map));
    if (body == null) {
        return;
    }

    Follow follow;
    try {
        follow = JSON.parseObject(body.string(), Follow.class);
    } catch (IOException e) {
        log.warn("failed to read follow list response, uk:{},start:{}", uk, start, e);
        return;
    }
    // Guard against an unparsable/empty payload before reading the error code.
    if (follow == null || follow.getErrno() != 0) {
        return;
    }

    List<Follow.FollowListBean> followListBeen = follow.getFollow_list();
    if (followListBeen == null || followListBeen.isEmpty()) {
        log.info("为空:{}", follow);
        return;
    }
    log.info("不为空:{}", follow);

    for (Follow.FollowListBean bean : followListBeen) {
        if (bean.getFollow_count() > 0) {
            // Index the user's shares only when there is something shared.
            getFllow(bean.getFollow_uk(), 0, bean.getPubshare_count() > 0);
        }
    }

    if (!goPage) {
        return;
    }
    // Page 0 was just handled; request the remaining pages of 24 entries each.
    int total_page = (follow.getTotal_count() - 1) / 24 + 1;
    for (int i = 1; i < total_page; i++) {
        getFllow(uk, i * 24, false);
    }
}

// Id of the Uinfo row most recently saved by indexResource; used as uinfo_id for ShareData rows.
long uinfoId = 0;
// Time (ms) of the most recent null response seen by indexResource, initialised at construction;
// two null responses less than 1500 ms apart trigger a 60 s pause.
long nullStart = System.currentTimeMillis();

/**
 * Stores the public shares of user {@code uk}.
 *
 * <p>Loads the first share page; when the user has at least one public share, a
 * {@code Uinfo} row is saved and the records of that page are stored as {@code ShareData}
 * rows. The remaining pages (20 records per page) are then loaded and stored the same way.
 * A page that cannot be loaded is retried until it succeeds; if the thread is interrupted
 * while backing off, indexing of this uk is abandoned.
 *
 * @param uk user key whose shares are indexed
 */
public void indexResource(long uk) {
    String shareUrl = "http://pan.baidu.com/wap/share/home?uk=%s&start=%s&adapt=pc&fr=ftw";

    YunData firstPage = fetchSharePage(shareUrl, uk, 0);
    if (firstPage == null) {
        return; // interrupted while waiting to retry
    }

    int share_count = firstPage.getUinfo().getPubshare_count();
    if (share_count > 0) {
        Uinfo uinfo = new Uinfo();
        uinfo.set("uname", firstPage.getUinfo().getUname())
                .set("avatar_url", firstPage.getUinfo().getAvatar_url())
                .set("uk", uk)
                .set("incache", 1)
                .save();
        uinfoId = uinfo.getLong("id");
        saveShareRecords(firstPage);
    }

    // Page 0 is already handled; walk the remaining pages of 20 records each.
    int totalPage = (share_count - 1) / 20 + 1;
    for (int i = 1; i < totalPage; i++) {
        YunData page = fetchSharePage(shareUrl, uk, i * 20);
        if (page == null) {
            return; // interrupted while waiting to retry
        }
        saveShareRecords(page);
    }
}

/**
 * Loads one share page, retrying until data is returned. When two null responses occur
 * less than 1500 ms apart (possibly a rate limit), pauses for 60 s before retrying.
 * Returns null only if the thread is interrupted during that pause.
 */
private YunData fetchSharePage(String urlTemplate, long uk, int start) {
    String real_url = String.format(urlTemplate, uk, start);
    while (true) {
        YunData yunData = DataUtil.getData(real_url);
        if (yunData != null) {
            log.info("{}", yunData.toString());
            return yunData;
        }
        log.warn("uk:{},msg:{}", uk, yunData);
        long previousNull = nullStart;
        nullStart = System.currentTimeMillis();
        if ((nullStart - previousNull) < 1500) {
            try {
                Thread.sleep(60000);
            } catch (InterruptedException e) {
                // Restore the flag and give up; retrying here would spin without delay.
                Thread.currentThread().interrupt();
                log.warn("interrupted while backing off, uk:{},start:{}", uk, start);
                return null;
            }
        }
    }
}

/** Saves every record of the given page as a ShareData row linked to the current uinfoId. */
private void saveShareRecords(YunData page) {
    if (page.getFeedata() == null) {
        return;
    }
    List<Records> recordses = page.getFeedata().getRecords();
    if (recordses == null) {
        return;
    }
    for (Records record : recordses) {
        new ShareData()
                .set("title", record.getTitle())
                .set("shareid", record.getShareid())
                .set("uinfo_id", uinfoId)
                .save();
    }
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
/**
 * Helper that downloads a page and extracts the {@code window.yunData = {...}} JSON object
 * embedded in its source.
 */
public class DataUtil {

    /**
     * Matches the {@code window.yunData = } assignment and captures everything up to the
     * last closing brace on that line. Compiled once instead of on every call.
     */
    private static final Pattern YUN_DATA_PATTERN = Pattern.compile("window\\.yunData = (.*})");

    /**
     * Downloads {@code url} and parses the embedded yunData JSON into a {@link YunData}.
     *
     * @param url page address to download
     * @return the parsed data, or {@code null} when the request yields no body, the body
     *         cannot be read, or the page contains no yunData assignment
     */
    public static YunData getData(String url) {
        // OkhttpUtil is the project's own wrapper around OkHttp.
        ResponseBody body = OkhttpUtil.syncGet(url);
        if (body == null) {
            return null;
        }

        String html;
        try {
            html = body.string();
        } catch (IOException ignored) {
            // An unreadable body is reported like a missing one; callers retry on null.
            return null;
        }

        // When the assignment occurs more than once, the last occurrence wins.
        String json = null;
        Matcher matcher = YUN_DATA_PATTERN.matcher(html);
        while (matcher.find()) {
            json = matcher.group(1);
        }
        if (json == null) {
            return null;
        }

        // Parsed with fastjson.
        return JSON.parseObject(json, YunData.class);
    }
}

YunData自己获取下json数据,就能创建出来,代码就不放了.

这么爬取速度很快,3台服务器一天就爬取了100多万.
Golang版本

阅读全文……

标签 : , ,



发表评论 发送引用通报