<font face=courier size=2>
// Spider a site and collect outbound links.
//
// Crawls www.navidir.com starting from its root URL, printing the URL and
// HTML title of every page fetched, then lists all spidered URLs followed by
// the outbound links accumulated along the way.
//
// A C++ web crawler can be created by using the CkSpider class to crawl a
// site. The domains of the outbound links can be added to a list of
// "not-yet-spidered" domains, which in turn can be spidered by CkSpider.
// There is great flexibility in limiting the number of pages crawled, and in
// avoiding pages with URLs matching "avoid patterns".
//
// The C++ reference will soon be available; until then, study the reference
// doc for the methods/properties of the Chilkat Spider ActiveX at
//     http://www.chilkatsoft.com/refdoc/xSpiderRef.html
// The methods/properties of the C++ class are the same. Refer to the
// CkSpider.h header file for specifics.
//
// The CkSpider class is included in the "Chilkat Visual C++ Library", which
// is the 2nd download listed at
//     http://www.chilkatsoft.com/downloads.asp
//
// The spider is robots.txt compliant.
void testSpider(void)
{
    CkSpider spider;
    spider.Initialize("www.navidir.com");
    spider.AddUnspidered("http://www.navidir.com/");

    // Set the cache directory.
    // (Run this program twice, and you will see how the 2nd run fetches
    // from cache.) Both cache options must be enabled for that to happen.
    spider.put_CacheDir("c:/ChilkatSpider/");
    spider.put_FetchFromCache(true);  // Fetch from the cache if possible.
    spider.put_UpdateCache(true);     // Update the cache with pages that are fetched.

    CkString html;
    CkString lastUrl;
    CkString lastTitle;

    while (spider.get_NumUnspidered() > 0)
    {
        // Download the next unspidered URL.
        // In this example, we are not doing anything with the HTML content
        // of each page spidered.
        // NOTE(review): any failure indication from CrawlNext is not
        // checked here -- confirm its return type against CkSpider.h.
        spider.CrawlNext(html);

        // Log the URL and HTML title tag of each URL fetched.
        spider.get_LastUrl(lastUrl);
        spider.get_LastHtmlTitle(lastTitle);
        printf("%s\n%s\n\n", lastUrl.getString(), lastTitle.getString());
    }

    // Now list the URLs we spidered, and the outbound links accumulated.
    CkString url;

    printf("Spidered URLs:\n");
    for (int i = 0; i < spider.get_NumSpidered(); i++)
    {
        spider.GetSpideredUrl(i, url);
        printf("\t%s\n", url.getString());
    }

    printf("Outbound Links:\n");
    for (int i = 0; i < spider.get_NumOutboundLinks(); i++)
    {
        spider.GetOutboundLink(i, url);
        printf("\t%s\n", url.getString());
    }
}
</font>