Statistics
| Branch: | Tag: | Revision:

mongoose / examples / simple_crawler / simple_crawler.c @ eaef5bd1

History | View | Annotate | Download (2.35 KB)

1
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "mongoose.h"
#include "../../../slre/slre.h"
5

    
6
/*
 * Pattern used to find links in a page body. The outer capture group is the
 * whole absolute http:// or https:// URL that follows `href="`; the inner
 * group is just the scheme prefix.
 */
static const char *regex = "href=\"((https?://)[^\\s/'\"<>]+/?[^\\s'\"<>]*)";

/* Pages loaded at this depth are reported but their links are not followed. */
const int max_depth = 2;
8

    
9
/*
 * Per-connection crawl state, stored in the connection's user_data pointer.
 */
struct userdata {
  char *url; /* heap-allocated, NUL-terminated copy of the URL being loaded */
  int depth; /* link distance of this page from the start page (start = 0) */
};
13

    
14
/*
 * Starts loading `url` on `mgr` and records `depth` for the new connection.
 * `url_len` is the number of bytes of `url` to use, or (size_t) ~0 when
 * `url` is a NUL-terminated string whose length should be measured.
 */
void crawl_page(struct mg_mgr *mgr, const char *url, size_t url_len, int depth);

/*
 * Processes a received HTTP reply on `nc`: reports the loaded page and
 * crawls the links found in the body of `hm`.
 */
void handle_reply(struct mg_connection *nc, struct http_message *hm);
16

    
17
static void event_handler(struct mg_connection *nc, int event, void *data) {
18
  struct http_message *hm = (struct http_message *) data;
19
  int connect_status;
20

    
21
  switch (event) {
22
    case MG_EV_CONNECT:
23
      connect_status = *(int *) data;
24
      if (connect_status != 0) {
25
        printf("Error while loading page: %s, error: %s\n",
26
               ((struct userdata *) nc->user_data)->url,
27
               strerror(connect_status));
28
      }
29
      break;
30
    case MG_EV_CLOSE:
31
      free(((struct userdata *) nc->user_data)->url);
32
      free(nc->user_data);
33
      break;
34
    case MG_EV_HTTP_REPLY:
35
      handle_reply(nc, hm);
36
      nc->flags |= MG_F_SEND_AND_CLOSE;
37
      break;
38
    default:
39
      break;
40
  }
41
}
42

    
43
int main(void) {
44
  struct mg_mgr mgr;
45

    
46
  mg_mgr_init(&mgr, NULL);
47
  crawl_page(&mgr, "http://www.simpleweb.org/", ~0, 0);
48

    
49
  for (;;) {
50
    mg_mgr_poll(&mgr, 1000);
51
  }
52

    
53
  mg_mgr_free(&mgr);
54

    
55
  return 0;
56
}
57

    
58
void crawl_page(struct mg_mgr *mgr, const char *url, size_t url_len,
59
                int depth) {
60
  struct mg_connection *nc;
61
  struct userdata *data = malloc(sizeof(struct userdata));
62

    
63
  if (url_len == (size_t) ~0) {
64
    url_len = strlen(url);
65
  }
66

    
67
  data->url = strncpy(malloc(url_len + 1), url, url_len);
68
  data->url[url_len] = '\0';
69
  data->depth = depth;
70

    
71
  nc = mg_connect_http(mgr, event_handler, url, NULL, NULL);
72
  nc->user_data = data;
73
}
74

    
75
void handle_reply(struct mg_connection *nc, struct http_message *hm) {
76
  struct userdata *ud = (struct userdata *) nc->user_data;
77
  const char *body = hm->body.p;
78

    
79
  int offset, max_matches = 2, cursor = 0, str_len = strlen(body);
80
  struct slre_cap caps[max_matches];
81

    
82
  printf("Loaded url: %s at depth %d\n", ud->url, ud->depth);
83
  if (ud->depth == max_depth) {
84
    return;
85
  }
86

    
87
  while (cursor < str_len &&
88
         (offset = slre_match(regex, body + cursor, str_len - cursor, caps,
89
                              max_matches, SLRE_IGNORE_CASE)) > 0) {
90
    crawl_page(nc->mgr, caps[0].ptr, caps[0].len, ud->depth + 1);
91
    cursor += offset;
92
  }
93
}