Multithreaded Web Scraper
An exercise to test what I was learning at the time
Around February and March I read Song-ge's answer on Zhihu and decided to try implementing his beginner exercise. Even the tiniest bit of work felt like a struggle: my grasp of the web, operating systems, and C was just too weak, and small problems would block me for a long time. So I also watched a few videos on HTTP packet capture and read some network-programming books, then got busy with other things and shelved the project. Looking back now, Song-ge's summary still rings true; maybe I should rethink my own way of learning:
Purposeful practice, especially solving one complete, end-to-end application problem, is the surest way to learn.
Fetching a single page
An HTTP/1.1 GET request must carry the Host header.
I had previously watched Tao Hui's course (a pirated copy 😂) on Bilibili.
HTTP/2's headline feature is multiplexing; HTTP/3 moves to QUIC, a UDP-based transport protocol.
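Spelled out, the request the code below assembles is just these bytes; each header line ends in CRLF, and an empty line terminates the headers:
const char *example_request =
    "GET / HTTP/1.1\r\n"       /* request line */
    "Host: news.baidu.com\r\n" /* mandatory in HTTP/1.1 */
    "\r\n";                    /* blank line ends the headers */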
#include <stdio.h>
#include <string.h> // memset memcpy
#include <stdlib.h> // free exit
#include <unistd.h> // read write close
#include <sys/socket.h> // socket connect
#include <netinet/in.h> // sockaddr_in sockaddr
#include <netdb.h> // hostent gethostbyname
static void error(const char *msg) { perror(msg); exit(1); }
int main(int argc, char **argv)
{
//const char *domain = strlen(argv[1]) > 0 ? argv[1] : "localhost";
//const int port = atoi(argv[2]) > 0 ? atoi(argv[2]) : 80;
const char *domain = "news.baidu.com";
const int port = 80;
int sock = socket(AF_INET, SOCK_STREAM, 0);
if(sock < 0)
error("Fail to opening socket");
struct hostent *host = gethostbyname(domain);
if(!host)
error("No such host");
struct sockaddr_in serv_addr;
memset(&serv_addr, 0, sizeof(serv_addr));
serv_addr.sin_family = AF_INET;
serv_addr.sin_port = htons(port);
memcpy(&serv_addr.sin_addr.s_addr, host->h_addr, host->h_length);
if(connect(sock, (struct sockaddr *)&serv_addr, sizeof (serv_addr)) < 0)
error("Fail to connect host");
char request[BUFSIZ];
// HTTP/1.1 requires the Host header; each line ends with CRLF, and a blank line ends the headers
char *head = "GET / HTTP/1.1\r\n";
sprintf(request, "%sHost: %s", head, domain);
strcat(request, "\r\n\r\n");
printf("Request info:\n%s\n", request);
int bytes;
size_t total = strlen(request);
size_t sent = 0;
do
{
bytes = (int)write(sock, request + sent, total - sent);
if(bytes < 0)
error("Error writing to sock");
if(bytes == 0)
break;
sent += bytes;
} while(sent < total);
char response[BUFSIZ];
memset(response, 0, sizeof(response));
char *document = malloc(1 << 30); // generously sized scratch buffer for the whole page
size_t len = 0;
printf("Response:\n");
do
{
bytes = (int)recv(sock, response, BUFSIZ - 1, 0);
if(bytes < 0)
error("Error reading from sock");
if(bytes == 0)
break; // server closed the connection
response[bytes] = '\0';
memcpy(document + len, response, (size_t)bytes);
len += bytes;
fprintf(stdout, "%s\n", response);
} while(1);
document[len] = '\0';
FILE *fp = fopen("0000.html", "w");
fprintf(fp, "%s", document);
fclose(fp);
close(sock);
free(document);
printf("Done!n");
getchar();
return 0;
}
I'm still not very familiar with the GET request headers. The Sohu news pages come back as garbled bytes, and Sina and Tencent do too; only Baidu is barely usable, and everything except Baidu resolves to IPv6 addresses. Some dynamically loaded page content also never shows up. I'll deal with all that when I get the chance; it doesn't feel that important right now.
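My guess is that the garbled bodies from Sohu, Sina, and Tencent are gzip-compressed responses. Here is a minimal sketch of a fuller request builder that declines compression and asks the server to close the connection afterwards; the extra headers (User-Agent, Accept, Accept-Encoding: identity, Connection: close) are my assumption about what helps, not something the code above sends:
#include <stdio.h>

// Sketch: build a request with a few extra headers. Accept-Encoding: identity
// asks the server not to gzip the body (my guess for the garbled pages), and
// Connection: close tells the server to end the connection after responding.
static void build_request(char *out, size_t cap, const char *host, const char *path)
{
    snprintf(out, cap,
        "GET %s HTTP/1.1\r\n"
        "Host: %s\r\n"
        "User-Agent: tiny-crawler/0.1\r\n"
        "Accept: text/html\r\n"
        "Accept-Encoding: identity\r\n"
        "Connection: close\r\n"
        "\r\n",
        path, host);
}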
Parsing URLs
Parsing character by character feels hard, so I match the href attribute with a regex, although I still can't write the regex character classes confidently. Besides full http(s) URLs, some sections come out as a path suffix relative to the domain, or carry a redundant // prefix, so I use a flag plus a switch to handle the cases separately. The group matching here follows a reference example of regex group matches.
#include <stdio.h>
#include <string.h> // memset memcpy
#include <stdlib.h> // free exit
#include <regex.h> // regcomp regexec regfree
#include <limits.h>
char *fgetls(FILE *fp){
// seek to the end of the file
fseek(fp, 0L, SEEK_END);
// total length of the file
long length = ftell(fp);
// rewind to the start
rewind(fp);
char *whole_text = calloc(1, length + 1);
if(1 != fread(whole_text, length, 1, fp)){
free(whole_text);
fprintf(stderr, "fread failed\n");
exit(1);
}
return whole_text;
}
typedef enum{
U_CHECKED,
U_REDUNTANT_PREFIX,
U_SUFFIX,
U_INGNORE
}U_FLAG;
int url_check(char *url){
char proto[5];
memcpy(proto, url, 4);
proto[4] = '\0';
if(0 == strcmp(proto, "http")) return U_CHECKED;
if('/' == proto[0]){
if('/' == proto[1])
return U_REDUNTANT_PREFIX;
return U_SUFFIX;
}
return U_INGNORE;
}
int main(int argc, char **argv)
{
FILE *fp = fopen("/Users/wcw/0000.html", "r");
char *source = fgetls(fp);
char *pattern = "href=\"([a-zA-Z0-9%/?=.&_#-]+)\"";
size_t maxMatches = INT_MAX;
size_t maxGroups = 2;
regex_t regexCompiled;
regmatch_t groupArray[maxGroups];
if (regcomp(&regexCompiled, pattern, REG_EXTENDED))
{
printf("Could not compile regular expression.\n");
return 1;
}
char *cursor = source;
for (size_t m = 0; m < maxMatches; m++)
{
if (regexec(&regexCompiled, cursor, maxGroups, groupArray, 0))
break; // No more matches
size_t offset = 0;
for (size_t g = 0; g < maxGroups; g++)
{
if (groupArray[g].rm_so == (size_t)-1)
break; // No more groups
if (g == 0)
offset = groupArray[g].rm_eo;
char domain[BUFSIZ] = "news.baidu.com";
char cursorCopy[strlen(cursor) + 1];
strcpy(cursorCopy, cursor);
cursorCopy[groupArray[g].rm_eo] = 0;
char *url = cursorCopy + groupArray[g].rm_so;
if(g == 1){
switch(url_check(url)){
case U_INGNORE: break;
case U_CHECKED: strcpy(domain, url); break;
case U_SUFFIX: strcat(domain, url); break;
case U_REDUNTANT_PREFIX: strcpy(domain, url + 2); break; // drop the redundant "//" prefix
}
fprintf(stdout, "%u:%sn", m, domain);
}
}
cursor += offset;
}
free(source);
regfree(&regexCompiled);
fclose(fp);
return 0;
}
Put everything together today. The response is a bit sluggish; the recv loop's exit condition needs tweaking. I also feel the blog layout needs a refresh.
Got busy with other things and the project dragged on again for a long while. While scripting the login for the daily epidemic health report, I took the chance to study requests.get and fixed some of the bugs here. There is still an occasional malloc "incorrect checksum" bug that I don't really understand yet.
realloc still has to be used with care: heap corruption from earlier allocations gets detected during the regex matching, and my previous realloc growth condition may not have been entirely correct. Socket reads and writes are not fixed-size the way they appear in Python; a call may transfer fewer bytes than requested, and the recv while-loop should exit only when the received length is 0. Temporary memory that never escapes its enclosing scope can use alloca; nested blocks are not affected.
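As a sketch of what I mean by the partial-transfer handling and the "recv until 0" exit condition, with the buffer grown by realloc; the helper names write_all and recv_all are mine, not part of the code above:
#include <stdlib.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <unistd.h>

// Keep calling write() until the whole buffer is out; one call may send less.
static int write_all(int fd, const char *buf, size_t len)
{
    size_t sent = 0;
    while (sent < len) {
        ssize_t n = write(fd, buf + sent, len - sent);
        if (n <= 0) return -1;
        sent += (size_t)n;
    }
    return 0;
}

// Read until recv() returns 0 (peer closed), growing the buffer as needed.
static char *recv_all(int fd, size_t *out_len)
{
    size_t cap = 4096, len = 0;
    char *buf = malloc(cap);
    if (!buf) return NULL;
    for (;;) {
        if (len + 4096 + 1 > cap) {           // make room for the next chunk
            char *tmp = realloc(buf, cap * 2);
            if (!tmp) { free(buf); return NULL; }
            buf = tmp;
            cap *= 2;
        }
        ssize_t n = recv(fd, buf + len, 4096, 0);
        if (n < 0) { free(buf); return NULL; }
        if (n == 0) break;                    // only exit when the peer has closed
        len += (size_t)n;
    }
    buf[len] = '\0';
    *out_len = len;
    return buf;
}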
On today's web TLS is pretty much standard, with plain unencrypted HTTP still around for compatibility, so page fetching has to handle both cases. I picked OpenSSL as the TLS library; self-testing also needs a certificate (https://www.jianshu.com/p/1de38a3f50f3). The crawler is mostly a client, so cert.pem is what's needed; a server would additionally need the private key key.pem. When I get the chance I'd also like to implement a bare-bones TLS myself to understand the details.
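A rough sketch of the client side with OpenSSL, assuming the TCP socket is already connected to port 443 the same way as in the plain-HTTP code above; error handling is trimmed and the function name https_get is mine:
#include <stdio.h>
#include <string.h>
#include <openssl/ssl.h>
#include <openssl/err.h>

// Sketch: speak HTTP over TLS on an already-connected TCP socket (port 443).
static void https_get(int sock, const char *host, const char *request)
{
    SSL_CTX *ctx = SSL_CTX_new(TLS_client_method());
    SSL *ssl = SSL_new(ctx);
    SSL_set_tlsext_host_name(ssl, host); // most servers require SNI
    SSL_set_fd(ssl, sock);               // hand the connected socket to OpenSSL
    if (SSL_connect(ssl) != 1) {         // TLS handshake
        ERR_print_errors_fp(stderr);
        SSL_free(ssl);
        SSL_CTX_free(ctx);
        return;
    }
    SSL_write(ssl, request, (int)strlen(request)); // same request bytes as before
    char buf[4096];
    int n;
    while ((n = SSL_read(ssl, buf, sizeof buf - 1)) > 0) {
        buf[n] = '\0';
        fputs(buf, stdout);
    }
    SSL_shutdown(ssl);
    SSL_free(ssl);
    SSL_CTX_free(ctx);
}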
http://investors.sohu.com/ does not close the connection after the response is complete, so the process blocks forever; a timer needs to be added.
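One way to get that timer, sketched under the assumption that a receive timeout on the socket is enough: with SO_RCVTIMEO set, an idle recv() fails with EAGAIN/EWOULDBLOCK instead of blocking forever (the 5-second value is arbitrary):
#include <sys/socket.h>
#include <sys/time.h>

// Sketch: make recv() give up after `seconds` of silence rather than hang.
static void set_recv_timeout(int sock, long seconds)
{
    struct timeval tv = { .tv_sec = seconds, .tv_usec = 0 };
    setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof tv);
    // A timed-out recv() then returns -1 with errno EAGAIN/EWOULDBLOCK,
    // which the read loop can treat as "the server is done talking".
}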
I'd like to reuse one code path for SSL_write under TLS and plain write, but their signatures differ, so it's awkward to unify; right now it's all bugs.
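One possible way to unify them, purely my own sketch rather than anything already in this project: hide the difference behind a small connection struct and dispatch on whether TLS is active.
#include <unistd.h>
#include <sys/types.h>
#include <openssl/ssl.h>

// Hypothetical wrapper: one struct covers both plain and TLS connections.
typedef struct {
    int fd;
    SSL *ssl; // NULL when the connection is plain HTTP
} conn_t;

static ssize_t conn_write(conn_t *c, const void *buf, size_t len)
{
    if (c->ssl)
        return SSL_write(c->ssl, buf, (int)len);
    return write(c->fd, buf, len);
}

static ssize_t conn_read(conn_t *c, void *buf, size_t len)
{
    if (c->ssl)
        return SSL_read(c->ssl, buf, (int)len);
    return read(c->fd, buf, len);
}
The rest of the crawler could then call conn_write and conn_read without caring which transport is underneath.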