Question:
Do you have an example of how to use your HTML-to-XML component to convert an HTML page to plain text, i.e., to remove everything from the XML except the text?
Answer:
// Download a web page with Chilkat HTTP, convert its HTML to XML, and show
// only the text content of the page in textBox1.
Chilkat.Http http = new Chilkat.Http();
http.UnlockComponent("HTTP 30-day trial");
// Make our HTTP client mimic Mozilla Firefox
http.MimicFireFox = true;
// Do not fetch from cache or save to cache.
http.FetchFromCache = false;
http.UpdateCache = false;
Chilkat.HtmlToXml conv = new Chilkat.HtmlToXml();
conv.UnlockComponent("Html-to-Xml 30-day trial");
// Get the HTML for a Yahoo! business news story.
// NOTE: the URL literal must end with a plain ASCII double quote; a
// typographic quote pasted from a web page leaves the string unterminated.
string html = http.QuickGetStr("http://biz.yahoo.com/ap/060130/earns_exxon_mobil.html?.v=5");
Chilkat.Xml xml = new Chilkat.Xml();
string pageText;
if (html == null)
{
    // Chilkat documents QuickGetStr as returning null when the request
    // fails; surface its diagnostic text rather than converting nothing.
    pageText = http.LastErrorText;
}
else
{
    // Convert the HTML to XML and load it into Chilkat XML.
    conv.Html = html;
    xml.LoadXml(conv.ToXml());
    // After converting to XML, the text is located entirely under
    // "text" nodes. We don't want anything under "script" sub-trees,
    // so eliminate those by specifying "script" in the 2nd argument.
    pageText = xml.AccumulateTagContent("text", "script");
}
textBox1.Text = pageText;