I'm still new to programming, so I need some help in creating an html parser.
Here's what I have so far (there are a few separate files; I got some parts of the code from someone who is helping me):
My HtmlParser header file (HtmlParser.h):
#ifndef HTML_PARSER_H
#define HTML_PARSER_H
#ifndef FSTREAM
# include <fstream>
# define FSTREAM
#endif
#ifndef STDEXCEPT
# include <stdexcept>
# define STDEXCEPT
#endif
#ifndef MAP
# include <map>
# define MAP
#endif
#ifndef STRING
# include <string>
# define STRING
#endif
/** \class HtmlParser
 *  \brief A parser for HTML documents.
 *
 *  The parser holds a reference to the std::istream passed to the
 *  constructor, a line counter that starts at 1, and a chunk mask.
 *  Derived classes must implement notify(), which receives a Chunk.
 *  parse() is only declared here; its definition lives elsewhere.
 */
class HtmlParser {
    // Declared in the same order as the constructor's initialiser list.
    std::istream &input_;   // stream handed to the constructor (not owned)
    unsigned lineno_;       // initialised to 1 by the constructor
    unsigned chunk_mask_;   // mask handed to the constructor
                            // NOTE(review): presumably selects which chunk types
                            // parse() reports -- confirm against parse().
public:
    // Chunk type bits.  Each constant is a distinct single bit, so several
    // of them can be OR-ed together into one mask.
    static const unsigned OPENING_TAG = 1u << 0u;
    static const unsigned CLOSING_TAG = 1u << 1u;
    static const unsigned MALFORMED_TAG = 1u << 2u;
    static const unsigned WORD = 1u << 3u;
    static const unsigned PUNCTUATION = 1u << 4u;
    static const unsigned COMMENT = 1u << 5u;
    static const unsigned WHITESPACE = 1u << 6u;
    static const unsigned END_OF_STREAM = 1u << 7u;
    static const unsigned UNEXPECTED_END_OF_STREAM = 1u << 8u;
    // The low sixteen bits, which covers every type bit defined above.
    static const unsigned EVERYTHING = 0xFFFF;

    /** \brief Attribute name -> attribute value map. */
    class AttributeMap: public std::map<std::string, std::string> {
    public:
        /** \brief  Insert a (name, value) pair into an AttributeMap.
         *  \param  name   The name of the key.
         *  \param  value  The value to be associated with "name".
         *  \return True if the attribute wasn't in the map yet, else false.
         *
         *  NOTE(review): the earlier wording of this comment said both that
         *  an old value is replaced and that an existing value is left
         *  alone.  The definition is not part of this header, so check the
         *  implementation and keep whichever statement is true.
         *
         *  This declaration hides the std::map::insert overloads inside
         *  AttributeMap.
         */
        bool insert(const std::string &name, const std::string &value);
    };

    /** \brief One item handed to notify(). */
    struct Chunk {
        const unsigned type_;        // type value given to the constructor
        const std::string text_;     // text given to the constructor
        const unsigned lineno_;      // line number given to the constructor
        AttributeMap attribute_map_; // starts out empty

        Chunk(const unsigned type, const std::string &text, const unsigned lineno)
            : type_(type), text_(text), lineno_(lineno) { }
    };

    /** \brief Construct a parser.
     *  \param input       Stream to read from; only a reference is kept, so
     *                     the stream must outlive the parser.
     *  \param chunk_mask  Mask of chunk type bits, EVERYTHING by default.
     */
    explicit HtmlParser(std::istream &input, unsigned chunk_mask=EVERYTHING)
        : input_(input), lineno_(1), chunk_mask_(chunk_mask) { }

    virtual ~HtmlParser() { }

    /** \brief Run the parser (defined outside this header). */
    void parse();

    /** \brief Callback to be implemented by derived classes.
     *  \param chunk  The chunk being reported.
     */
    virtual void notify(const Chunk &chunk) = 0;
};
#endif // ifndef HTML_PARSER_H
My main.cc file:
#ifndef IOSTREAM
# include
# define IOSTREAM
#endif
#ifndef CSTRING
# include
# define CSTRING
#endif
#ifndef CSTDLIB
# include
# define CSTDLIB
#endif
#ifndef HTML_PARSER_H
# include
#endif
// Program name printed at the start of the usage and exception messages on std::cerr.
const char *progname = "HtmlParserTest";
/** \class TestParser
 *  \brief HtmlParser subclass used by the test program.
 *
 *  The constructor simply forwards both arguments to HtmlParser; the
 *  notify() override is defined out of line.
 */
class TestParser: public HtmlParser {
public:
    /** \param input              Stream forwarded to HtmlParser.
     *  \param notification_mask  Chunk mask forwarded to HtmlParser.
     */
    explicit TestParser(std::istream &input, unsigned notification_mask)
        : HtmlParser(input, notification_mask)
    {
    }

    /** \brief Receives each reported chunk. */
    virtual void notify(const Chunk &chunk);
};
/** \brief  Write a character to a stream as a quoted C-style character literal.
 *  \param  output  Stream to write to.
 *  \param  ch      Character to print.
 *
 *  The characters newline, tab, backspace, carriage return, form feed,
 *  vertical tab, backslash, single quote and alert are written as a
 *  backslash followed by their escape letter (for example '\n').  Every
 *  other character is written unchanged.  The result is always wrapped in
 *  single quotes.
 */
void PrintChar(std::ostream &output, const char ch)
{
    // Letter that follows the backslash, or '\0' when no escape is needed.
    char escape_letter = '\0';
    switch (ch) {
    case '\n': escape_letter = 'n';  break;
    case '\t': escape_letter = 't';  break;
    case '\b': escape_letter = 'b';  break;
    case '\r': escape_letter = 'r';  break;
    case '\f': escape_letter = 'f';  break;
    case '\v': escape_letter = 'v';  break;
    case '\\': escape_letter = '\\'; break;
    case '\'': escape_letter = '\''; break;
    case '\a': escape_letter = 'a';  break;
    default:
        break;
    }

    output << '\'';
    if (escape_letter != '\0')
        output << '\\' << escape_letter;
    else
        output << ch;
    output << '\'';
}
void TestParser::notify(const Chunk &chunk)
{
if (chunk.type_ == HtmlParser::OPENING_TAG) {
std::cerr << "Found opening tag on line " << chunk.lineno_ << ": " << chunk.text_ << '\n';
for (HtmlParser::AttributeMap::const_iterator attrib(chunk.attribute_map_.begin());
attrib != chunk.attribute_map_.end(); ++attrib)
std::cerr << '\t' << attrib->first << " = " << attrib->second << '\n';
}
else if (chunk.type_ == HtmlParser::CLOSING_TAG)
std::cerr << "Found closing tag on line " << chunk.lineno_ << ": " << chunk.text_ << '\n';
else if (chunk.type_ == HtmlParser::MALFORMED_TAG)
std::cerr << "Found malformed tag on line " << chunk.lineno_ << ": " << chunk.text_ << '\n';
else if (chunk.type_ == HtmlParser:: PUNCTUATION)
std::cerr << "Found punctuation: " << chunk.text_[0] << '\n';
else if (chunk.type_ == HtmlParser::WORD)
std::cerr << "Found \"word\": " << chunk.text_ << " on line " << chunk.lineno_ << ".\n";
else if (chunk.type_ == HtmlParser::COMMENT)
std::cerr << "Found an HTML comment on line " << chunk.lineno_ << ".\n";
else if (chunk.type_ == HtmlParser::WHITESPACE) {
std::cerr << "Found whitespace ";
PrintChar(std::cerr, chunk.text_[0]);
std::cerr << " on line " << chunk.lineno_ << ".\n";
}
else if (chunk.type_ == HtmlParser::END_OF_STREAM)
std::cerr << "Found end-of-stream on line " << chunk.lineno_ << ".\n";
else if (chunk.type_ == HtmlParser::UNEXPECTED_END_OF_STREAM)
std::cerr << "Found unexpected end-of-stream on line " << chunk.lineno_ << " (" << chunk.text_ << ").\n";
}
/** \brief Print the command-line synopsis to std::cerr and terminate.
 *
 *  Does not return: the process exits with EXIT_FAILURE.
 */
void Usage()
{
    static const char option_summary[] =
        " [--report-opening-tags] [--report-closing-tags]"
        " [--report-words] [--report-whitespace] [--report-comments] [--report-punctuation]\n";

    std::cerr << "usage: " << ::progname << option_summary;
    std::exit(EXIT_FAILURE);
}
/** \brief  Test driver: parse HTML from standard input and report chunks.
 *  \param  argc  Number of command-line arguments.
 *  \param  argv  Command-line arguments; each one must be a --report-*
 *                option, anything else prints the usage text and exits.
 *  \return EXIT_SUCCESS after a completed parse, EXIT_FAILURE if a
 *          std::exception was caught.
 *
 *  Every recognised option adds one chunk type bit to the notification
 *  mask.  When no option is given the mask defaults to
 *  HtmlParser::EVERYTHING.
 */
int main(int argc, char *argv[])
{
    try {
        unsigned notification_mask(0);
        for (int arg_no = 1; arg_no < argc; ++arg_no) {
            if (std::strcmp("--report-opening-tags", argv[arg_no]) == 0)
                notification_mask |= HtmlParser::OPENING_TAG;
            else if (std::strcmp("--report-closing-tags", argv[arg_no]) == 0)
                notification_mask |= HtmlParser::CLOSING_TAG;
            else if (std::strcmp("--report-words", argv[arg_no]) == 0)
                notification_mask |= HtmlParser::WORD;
            else if (std::strcmp("--report-whitespace", argv[arg_no]) == 0)
                notification_mask |= HtmlParser::WHITESPACE;
            else if (std::strcmp("--report-comments", argv[arg_no]) == 0)
                notification_mask |= HtmlParser::COMMENT;
            else if (std::strcmp("--report-punctuation", argv[arg_no]) == 0)
                notification_mask |= HtmlParser::PUNCTUATION;
            else
                Usage();
        }

        // No --report-* option at all means "report everything".
        if (notification_mask == 0)
            notification_mask = HtmlParser::EVERYTHING;

        TestParser test_parser(std::cin, notification_mask);
        test_parser.parse();
        return EXIT_SUCCESS;
    }
    catch (const std::exception &x) {
        // The message is streamed, so no printf-style "%s" placeholder belongs here.
        std::cerr << ::progname << ": caught exception: " << x.what() << '\n';
        // Without this return, main would fall off its end and report success.
        return EXIT_FAILURE;
    }
}
I'm still working on the HtmlParser class, I know I'll need to add some private member functions and a few more member variables.
Also I'll explain the nested chunk structure a little. Basically when I find an opening tag ( < ) and whatever is inside it ( ie. a href = "http://...." ) I will assign it to one of the variables. lineno_ will be used to determine the line number of where the html tag was found. But I have a question, how would I go about reading what's inside the html tag; should I read each character one at a time?
Here's a basic idea of what I will be doing in the third file I'm working on (HtmlParser.cc):
#include "HtmlParser.h"
void HtmlParser:: parse()
{
...
input.get()
{
...
//found opening tag
Chunk *chunk;
chunk = new Chunk(OPENING_TAG, text, lineno_);
chunk -> attribute_map_.insert(// **(read below) still figuring out what to do here)
if(chunk_mask = &OPENING_TAG)
{
notify(chunk);
delete chunk;
}
}
** (I was thinking of adding two new variables, "name" and "value", so for example if there is an html tag like <a href="http://....">, then name would be "href" and value would be "http://....".)
I know everything is kind of messy right now, but can someone lend me some help?
[edited by - -dcx- on December 3, 2002 6:28:50 PM]