
Multi-threaded Web Scraper

An exercise to test what I was learning at the time

In February and March I read 嵩哥's answer on Zhihu and decided to try implementing the beginner exercise from it. After only a little work it already felt like a struggle: my grasp of the web, operating systems, and C was too weak, and even small problems would block me for a long time. So I also watched some videos on HTTP packet capture and read up on network programming, then got busy with other things and shelved the project. Looking back now, 嵩哥's summary is still very insightful; maybe I should rethink my own way of learning.

Deliberate practice, especially working through a complete application problem end to end, is the surest way to learn.

Fetching a single page

An HTTP/1.1 GET request must carry the Host header (picked up from 陶辉's course, a pirated copy I watched on Bilibili 😂). HTTP/2's headline feature is multiplexing, while HTTP/3 is built on QUIC, a protocol that runs on top of UDP.
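For reference, the raw request the program below puts on the wire looks like this (CRLF written out as \r\n; the final blank line terminates the header section):

GET / HTTP/1.1\r\n
Host: news.baidu.com\r\n
\r\n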

#include <stdio.h> 
#include <string.h> // memset memcpy
#include <stdlib.h> // free exit
#include <unistd.h> // read write close

#include <sys/socket.h> // socket connect
#include <netinet/in.h> // sockaddr_in sockaddr
#include <netdb.h> // hostent gethostbyname

static void error(const char *msg) {perror(msg); exit(1);}

int main(int argc, char **argv)
{
    //const char *domain = strlen(argv[1]) > 0 ? argv[1] : "localhost";
    //const int port = atoi(argv[2]) > 0 ? atoi(argv[2]) : 80;
    const char *domain = "news.baidu.com";
    const int port = 80;

    int sock = socket(AF_INET, SOCK_STREAM, 0);
    if(sock < 0)
        error("Fail to opening socket");
    
    struct hostent *host = gethostbyname(domain);
    if(!host)
        error("No such host");
    
    struct sockaddr_in serv_addr;
    memset(&serv_addr, 0, sizeof(serv_addr));
    serv_addr.sin_family = AF_INET;
    serv_addr.sin_port = htons(port);
    memcpy(&serv_addr.sin_addr.s_addr, host->h_addr, host->h_length);
    
    if(connect(sock, (struct sockaddr *)&serv_addr, sizeof (serv_addr)) < 0)
        error("Fail to connect host");
    
    char request[BUFSIZ];
    // HTTP/1.1 requires the Host header
    char *head = "GET / HTTP/1.1\r\n";
    sprintf(request, "%sHost: %s", head, domain);
    strcat(request, "\r\n\r\n");
    printf("Request info:\n%s\n", request);
    
    int bytes;
    size_t total = strlen(request);
    size_t sent = 0;
    do
    {
        // write() may send fewer bytes than requested, so loop until done
        bytes = (int)write(sock, request + sent, total - sent);
        if(bytes < 0)
            error("Error writing to sock");
        if(bytes == 0)
            break;
        sent += bytes;
    } while(sent < total);


    char response[BUFSIZ];
    char *document = malloc(1 << 30);
    if(!document)
        error("Fail to allocate document buffer");
    size_t len = 0;
    printf("Response:\n");
    do
    {
        // leave room for the terminator so response stays a valid C string
        bytes = (int)recv(sock, response, BUFSIZ - 1, 0);
        if(bytes < 0)
            error("Error reading from sock");
        if(bytes == 0)  // peer closed the connection: response complete
            break;
        response[bytes] = '\0';
        fprintf(stdout, "%s", response);
        // accumulate the whole document for the file dump below
        memcpy(document + len, response, bytes);
        len += bytes;
    } while(1);
    document[len] = '\0';
    

    FILE *fp = fopen("0000.html", "w");
    fprintf(fp, "%s", document);
    fclose(fp);
    close(sock);
    free(document);
    
    printf("Done!n");
    getchar();
    return 0;
}

I'm still not very familiar with the GET headers. Sohu's news pages come down as garbled bytes, and so do Sina's and Tencent's (likely because those servers reply with compressed bodies); only Baidu is barely usable. Everything except Baidu also resolves to IPv6 addresses. Content loaded dynamically by the page doesn't come through either; I'll deal with that later when I get the chance — it doesn't feel that important right now.
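A hedged guess at the garbled pages: the servers are probably gzip-compressing the body when the request doesn't say otherwise. A variant of the request-building lines in the program above that asks for an uncompressed response and an explicit close (Accept-Encoding: identity and Connection: close are standard HTTP/1.1 headers; whether these particular sites honor them is untested):

    // Drop-in replacement for the sprintf/strcat request-building above:
    // identity = "send the body uncompressed"; close = "hang up when done",
    // which also gives the recv() loop a clean EOF.
    snprintf(request, sizeof(request),
             "GET / HTTP/1.1\r\n"
             "Host: %s\r\n"
             "Accept-Encoding: identity\r\n"
             "Connection: close\r\n"
             "\r\n", domain);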

Parsing URLs

Parsing character by character felt a bit hard, so I match the href attribute with a regex instead, though I still can't write the regex character classes quite right. Besides http(s) URLs, some extracted values are path suffixes to be appended to the domain, or carry a redundant // prefix, so I use a flag and a switch to handle each case separately. The group-matching code here is adapted from a reference on regex group matching.

#include <stdio.h>
#include <string.h> // memset memcpy
#include <stdlib.h> // free exit

#include <regex.h>    // regcomp regexec regfree
#include <limits.h>

char *fgetls(FILE *fp){
    // seek to the end of the file
    fseek(fp, 0L, SEEK_END);
    // total length of the file
    long length = ftell(fp);
    // rewind to the beginning
    rewind(fp);
    char *whole_text = calloc(1, length + 1);
    if(1 != fread(whole_text, length, 1, fp)){
        free(whole_text);
        fprintf(stderr, "fread failed\n"), exit(1);
    }
    return whole_text;
}

typedef enum{
    U_CHECKED,
    U_REDUNDANT_PREFIX,
    U_SUFFIX,
    U_IGNORE
}U_FLAG;

int url_check(char *url){
    char proto[5];
    memcpy(proto, url, 4);
    proto[4] = '\0';
    
    if(0 == strcmp(proto, "http")) return U_CHECKED;
    if('/' == proto[0]){
        if('/' == proto[1])
            return U_REDUNDANT_PREFIX;
        return U_SUFFIX;
    }
    return U_IGNORE;
}


int main(int argc, char **argv)
{

    FILE *fp = fopen("/Users/wcw/0000.html", "r");
    if(!fp)
        fprintf(stderr, "fopen failed\n"), exit(1);
    char *source = fgetls(fp);
    // '-' goes last in the bracket expression so it is literal, not a range
    char *pattern = "href=\"([a-zA-Z0-9%/?=.&_#-]+)\"";
    
    size_t maxMatches = INT_MAX;
    size_t maxGroups = 2;
    
    regex_t regexCompiled;
    regmatch_t groupArray[maxGroups];
    
    if (regcomp(&regexCompiled, pattern, REG_EXTENDED))
    {
        printf("Could not compile regular expression.n");
        return 1;
    };
    
    
    char *cursor = source;
    for (size_t m = 0; m < maxMatches; m++)
    {
        if (regexec(&regexCompiled, cursor, maxGroups, groupArray, 0))
            break;  // No more matches
        
        
        size_t offset = 0;
        for (size_t g = 0; g < maxGroups; g++)
        {
            if (groupArray[g].rm_so == -1)
                break;  // No more groups
            
            if (g == 0)
                offset = groupArray[g].rm_eo;
            
            
            char domain[BUFSIZ] = "news.baidu.com";
            char cursorCopy[strlen(cursor) + 1];
            strcpy(cursorCopy, cursor);
            cursorCopy[groupArray[g].rm_eo] = 0;
            char *url = cursorCopy + groupArray[g].rm_so;
            if(g == 1){
                switch(url_check(url)){
                    case U_IGNORE: break;
                    case U_CHECKED: strcpy(domain, url); break;
                    case U_SUFFIX: strcat(domain, url); break;
                    case U_REDUNDANT_PREFIX: strcpy(domain, url + 2); break;
                }
                fprintf(stdout, "%zu:%s\n", m, domain);
            }
        }
        cursor += offset;
    }
    free(source);
    regfree(&regexCompiled);
    fclose(fp);
    return 0;
}

Integrated everything today. Responses are a bit sluggish; the exit condition of the recv loop needs reworking. I also feel like the blog layout could use a refresh.
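One way to fix the stalled recv exit, as a minimal sketch: put a receive timeout on the socket with SO_RCVTIMEO, a standard socket option (the helper name and the 3-second value are my own choices):

#include <sys/socket.h>
#include <sys/time.h>

// Hypothetical helper: give recv() a deadline so a server that keeps
// the connection open can no longer block the loop forever.
static int set_recv_timeout(int sock, int seconds)
{
    struct timeval tv = { .tv_sec = seconds, .tv_usec = 0 };
    return setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
}

A timed-out recv() then returns -1 with errno set to EAGAIN/EWOULDBLOCK, which the loop can treat as end-of-response instead of a fatal error.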


Busy with other things, the project stalled again for a long while. While scripting the login for the daily epidemic health report I took the chance to go over requests.get and fixed some bugs. Occasionally there is still an "incorrect checksum" malloc error (that message usually means the allocator detected heap corruption, e.g. a write past the end of a block or a use after free), which I can't quite figure out yet.


realloc still needs to be handled with care: the regex pass would notice the malloc'd buffer being modified, and my earlier condition for triggering realloc may not have been entirely correct. Unlike in Python, a socket write is not fixed-size — it may send fewer bytes than requested — and the exit condition for the recv loop should still be a received length of 0. For temporary memory that never escapes its enclosing scope, alloca works fine; nested scopes are not a problem.
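What I mean by using realloc carefully, as a sketch (buf_t and buf_append are names I made up for illustration): grow geometrically, reserve a byte for the terminator, and never hold a pointer into the old block across the call, since realloc may move the allocation — which would explain the regex pass seeing "modified" memory.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Hypothetical growable buffer for accumulating a response.
typedef struct { char *data; size_t len, cap; } buf_t;

static void buf_append(buf_t *b, const char *src, size_t n)
{
    if (b->len + n + 1 > b->cap) {
        size_t cap = b->cap ? b->cap : 4096;
        while (cap < b->len + n + 1) cap *= 2;
        char *p = realloc(b->data, cap);
        if (!p) { free(b->data); fprintf(stderr, "out of memory\n"); exit(1); }
        b->data = p;            // the old pointer is invalid from here on
        b->cap = cap;
    }
    memcpy(b->data + b->len, src, n);
    b->len += n;
    b->data[b->len] = '\0';     // keep it a valid C string for the regex pass
}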


TLS is more or less standard on today's web, but some sites still serve plain unencrypted HTTP, so page fetching has to handle both cases. For the TLS library I chose OpenSSL; self-testing also requires a [certificate](https://www.jianshu.com/p/1de38a3f50f3). A crawler is mostly a client, which only needs cert.pem; a server would additionally need the private key key.pem. Given the chance I'd also like to implement a bare-bones TLS myself someday to learn the details. http://investors.sohu.com/ doesn't close the connection after the response completes, so the process blocks indefinitely — a timer is needed. I also want one code path for both SSL_write and write under TLS, but their signatures differ, which is awkward to pull together; bugs everywhere.
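For the SSL_write/write signature mismatch, one sketch is to hide both behind a tiny wrapper (conn_t and conn_write are hypothetical names of my own; SSL_write is OpenSSL's real API and takes an int length):

#include <unistd.h>
#include <openssl/ssl.h>

// Hypothetical connection wrapper: plain fd, plus an SSL handle when TLS is on.
typedef struct {
    int fd;
    SSL *ssl;   // NULL for plain http, non-NULL for https
} conn_t;

static ssize_t conn_write(conn_t *c, const void *buf, size_t len)
{
    if (c->ssl)
        return SSL_write(c->ssl, buf, (int)len);  // OpenSSL uses int lengths
    return write(c->fd, buf, len);
}

The read side can mirror this with SSL_read versus recv behind the same struct.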

References

[1] https://github.com/ChenyuGao/Crawler-Parallel