// Parser.cpp
// parses html information and assigns it to the appropriate place
// also keeps track of links

#include "parser.h"




// Constructor.
//   data     - database object this parser is bound to; only its address is stored
//   stopFile - name of the stop-word file, passed on to the stops member
// NO_BODY is fixed at 20; open() stores it in readError when no <body tag is found.
Parser::Parser( DataBase &data, string stopFile )
	: NO_BODY(20),
	  stops(stopFile)
{
	database = &data;

	// no error recorded yet and every text buffer starts out empty
	readError = 0;
	leftover = "";
	description = "";
	cLine = "";

	// cleared here; getInput() raises it whenever freshly read text still needs its tags removed
	firstrun = false;
}




// Destructor.
// The DataBase is handed to the constructor by reference and only its address
// is kept, so it is not this object's to free.  The previous body nulled the
// pointer and then called delete on it, which did nothing but looked like an
// ownership release; only the pointer reset is kept.
Parser::~Parser() {
	database = NULL;
}





// Returns the next word of the document, or "" when there is nothing left
// (in which case the connection is also closed).
//
// A word starts at the first alphabetic character in cLine and runs up to the
// next space; anything before that first letter is discarded.  Words reported
// by stops.isStopWord() are skipped.  When cLine runs dry, more text is pulled
// in with getInput(), and freshly read text has its tags stripped first.
//
// Implemented as a loop rather than by recursion so the stack depth no longer
// grows with the number of skipped characters or stop words.
string Parser::nextString() {
	for (;;) {
		// refill only when the current buffer is used up; 0 from getInput() means end of input
		if (cLine.empty() && !getInput()) {
			close();
			return "";
		}

		if (firstrun) {				// tags have not been stripped from this chunk yet
			removeTags(cLine);
		}

		// skip everything in front of the next letter (cast keeps isalpha defined for chars >= 0x80)
		string::size_type start = 0;
		while ((start < cLine.length()) && !isalpha(static_cast<unsigned char>(cLine[start]))) {
			++start;
		}
		if (start == cLine.length()) {	// no letter left in this chunk (also covers an empty chunk)
			cLine = "";
			continue;
		}

		string word;
		const string::size_type breaker = cLine.find(" ", start);
		if (breaker == string::npos) {
			// last word in the buffer: take all of it, including its final character
			word = cLine.substr(start);
			cLine = "";
		}
		else {
			word = cLine.substr(start, breaker - start);
			cLine = cLine.substr(breaker + 1);
		}

		if (!stops.isStopWord(word)) {
			return word;
		}
		// stop word: go round again for the next one
	}
}






// Hands back whatever links.getNext() yields next.
string Parser::nextLink() {
	return this->links.getNext();
}





// Accessor for the description string chosen by open().
string Parser::getDescription() {
	return this->description;
}






// Sets the limiting domain on the links object.
// The name is lower-cased first (via convertCase) and the lower-cased
// text is what gets passed to links.assignLimit().
void Parser::setDomain( string name ) {
	string lowered = name;
	convertCase(lowered);
	links.assignLimit(lowered);
}






// opens a new file for reading, returns 1 if succesful
// also gets the description of the file and starts cLine out at the beggining
// of the body
int Parser::open( string name ) {
	
	cLine = "";
	leftover = "";
	//cout << "Opening file . . .";
	char *URL = new char[name.length()+1];						// must change string name to char* for calling openConnection
	name.copy( URL, name.length(), 0 );
	URL[name.length()] = 0;
	readError = connection.openConnection(URL);	// open a new HTML doc
	bool finished = !getInput();			// get a new cLine, finished is true if there is nothing left to get
	
	//cout << "\nSuccesfully Retrieved first input";
	
	int findTitle = 0;
	int findH1 = 0;
	int findBody = 0;
	
	findTitle = cLine.find("<title>", 0);	// try to find various tags in order to get description
	//cout << "\nfind title: " << findTitle;
	bool found = false;			// looking for <h1>, etc...
	while ((findH1 != string::npos) && (!found))  {
		findH1 = cLine.find("<h", findH1+1);
		if ((findH1 != string::npos) && (isdigit(cLine[findH1+2]))) {found = true;}
	}
	//if (findH1 >= 0) cout << "\n cLine: " << cLine.substr(findH1,10) << "\n\n";
	//cin.get();
		
	findBody = cLine.find("<body", 0);	// looking for <body
		
// if first pass to find description is not succesful
// do it 'till we find it or there's nothing left
		
	while ((!finished && (findTitle == string::npos)) && ((findBody == string::npos) && (findH1 == string::npos))) {
			finished = !getInput();
			findTitle = cLine.find("<title>");
			findH1 = cLine.find("<h");
				if (!isdigit(cLine[findH1+2])) {findH1 = string::npos;}
			findBody = cLine.find("<body");
	}
	
	//cout << "\nFound description!";
	
	if (finished) {		// if we didn't find anything, open the file again and take the first 100 characters
		close();
		readError = connection.openConnection(URL);
		extractLinks(cLine);
		description = cLine.substr(0,100);
	}
	
	else {
		if ( findTitle != string::npos) {	// if we found a title
			
			description = cLine.substr(findTitle + 7, (cLine.find("</title>") - findTitle - 7)); // get it as the description
			
			while (( findBody == string::npos) && (!finished)) {  // now look for <body tag
				finished = !getInput();
				findBody = cLine.find("<body");
			}
			
			if (finished) {		// if no <body tag is found, there is an error and nothing can be done w/ document
				cLine = "";
				readError = NO_BODY;
			}
			else {
				int endBody = cLine.find(">", findBody);
				cLine = cLine.substr((endBody+1), (cLine.length() - endBody-1)); // place cLine right after <body tag
			}
			
			//cout << "\nRemoved header info";
			//testData();
			
		}
		
		else if ( findH1 != string::npos ) {	// if we found a <h*> tag
				//cout << "\ndoing a header tag";
				int endTag = cLine.find(">", findH1 + 1);
				//cout << "\n" << endTag << " " << cLine.find("</h");
				//cin.get();
				description = cLine.substr(endTag + 1, (cLine.find("</h", findH1) - endTag - 1));	// get description out of <h*> tag
				
				close();												// reset document
				readError = connection.openConnection(URL);				
				
				finished = !getInput();
				findBody = cLine.find("<body");
				while (( findBody == string::npos) && (!finished)) {	// find <body
					finished = !getInput();
					findBody = cLine.find("<body");
				}
				
				if (finished) {	// if not found, major problem!
					cLine = "";
					readError = NO_BODY;
				}
				else {			// set cLine after <body tag
					int endBody = cLine.find(">", findBody);
					cLine = cLine.substr(endBody + 1, (cLine.length() - endBody - 1));
				}
			}
			
			else {				// finally, if only <body is found, get first 100 chars as description and set cLine after <body tag
				
				int endBody = cLine.find(">", findBody);
				cLine = cLine.substr(endBody +1, (cLine.length() - endBody -1));
				removeTags(cLine);
				description = cLine.substr(0,100);
				
			}
	}
	//cout << "\nDescription stripped cLine:\n	" << cLine;
	//cout << "\nDescription:	" << description;
	//links.testList();
	//cin.get();
	return getReadError();	// return any read errors found
}






// Shuts the connection and discards any buffered text
// (both the current chunk and the carried-over remainder).
void Parser::close() {
	connection.closeConnection();
	leftover.clear();
	cLine.clear();
}






// return readError
int Parser::getReadError() {
	return readError;
}




 

// Finds every quoted href value in the passed string and adds it to the
// links object.
//
// An occurrence counts only when "href" (any letter case) is followed by
// optional whitespace, '=', optional whitespace and a quote character (" or ');
// the link is the text up to the matching closing quote.  Occurrences that do
// not have that shape, or whose closing quote is missing, are skipped instead
// of adding the remainder of the line as a bogus link.
void Parser::extractLinks( string line) {
	const char *const blanks = " \t\r\n";

	// search a lower-cased copy so HREF/Href match; convertCase keeps the
	// length unchanged, so positions are valid in the original text as well
	string lowered = line;
	convertCase(lowered);

	string::size_type hrefPos = lowered.find("href");
	while (hrefPos != string::npos) {
		string::size_type resume = hrefPos + 4;		// where to search next if this one is no link

		string::size_type cursor = line.find_first_not_of(blanks, hrefPos + 4);
		if ((cursor != string::npos) && (line[cursor] == '=')) {
			cursor = line.find_first_not_of(blanks, cursor + 1);
			if ((cursor != string::npos) && ((line[cursor] == '"') || (line[cursor] == '\''))) {
				const string::size_type closing = line.find(line[cursor], cursor + 1);
				if (closing != string::npos) {
					links.add(line.substr(cursor + 1, closing - cursor - 1));
					resume = closing + 1;
				}
			}
		}

		hrefPos = lowered.find("href", resume);
	}
}






// Removes all tags from the passed string (in place) and then turns every
// character that is not a letter or digit into a space.
//
// Each "<...>" run is replaced by a single space.  The closing '>' is looked
// for after its '<', and a '<' with no '>' behind it is removed together with
// the rest of the string, so a stray '>' earlier in the text or an
// unterminated tag can no longer produce a bogus length or an endless loop.
// Also clears firstrun so the same chunk is not processed again.
void Parser::removeTags( string &line) {
	firstrun = false;

	string::size_type open = line.find('<');	// search the argument itself, not cLine
	while (open != string::npos) {
		const string::size_type close = line.find('>', open);
		const string::size_type span = (close == string::npos)
			? line.length() - open			// unterminated tag: drop through end of string
			: close - open + 1;
		line.replace(open, span, " ");
		// position 'open' now holds the space, so continue right behind it
		open = line.find('<', open + 1);
	}

	for (string::size_type i = 0; i < line.length(); ++i) {
		// cast keeps isalnum defined for characters >= 0x80
		if (!isalnum(static_cast<unsigned char>(line[i]))) {
			line[i] = ' ';
		}
	}
}






// Get next section of document from getInput, returns 0 if there is nothing left
int Parser::getInput() {
	firstrun = true;					// indicate that we will need to run removeTag again
	char *buf = new char[BUFSIZE+1];
	//cout << "\nAbout to read cLine";
	//cin.get();
	int finished = connection.read_buf(buf);
	//cout << "\n Got cLine!";
	//cin.get();
	cLine = leftover + buf;		// assign cLine to leftover string from last time plus the new input
	//cout << "\nThe original cLine: \n" << cLine;
	// find Leftover loop
	if (finished != 0) {	// if connection is finished, don't do this				
		int lastOpenTag = cLine.find_last_of("<");		
		int lastCloseTag = cLine.find_last_of(">");
		if (lastOpenTag == string::npos) {		// string::npos is too large for easy comparison
			lastOpenTag = -1;					// change it to -1
		}
		if (lastCloseTag == string::npos) {
			lastCloseTag = -1;
		}
		int cutoff;
		if ( lastOpenTag > lastCloseTag) {		// compare last instance of < with > to see if we've cut off the middle of a tag
			cutoff = lastOpenTag;				// if so, we need to cut off at <
		}
		else {
			cutoff = cLine.find_last_of(" ");	// otherwise, we need to cutoff at the last white space
			if (lastCloseTag > cutoff) {cutoff = lastCloseTag+1;}
		}
		leftover = cLine.substr(cutoff, cLine.length() - cutoff);	// set leftover
		cLine = cLine.substr(0, cutoff);		// remove leftover from cLine
	}
	//delete [] buf;
	//cout << "\n\nLeftover: " << leftover;
	extractLinks(cLine);
	// replaceAll(cLine, "\n" , " ");		// strip endl characters from string
	convertCase(cLine);					// convert everything to lowercase to make things easier
	//cout << "\nModified cLine: \n" << cLine;
	//cin.get();
	
	return finished;							// if it is finished, return 0.
}






// Converts the passed string to lower case in place; its length is unchanged.
void Parser::convertCase( string &line ) {
	for (string::size_type i = 0; i < line.length(); ++i) {
		// tolower is only defined for values representable as unsigned char
		// (or EOF), so go through unsigned char for characters >= 0x80
		line[i] = static_cast<char>(tolower(static_cast<unsigned char>(line[i])));
	}
}






// Replaces every occurrence of ostring in line with nstring (in place) and
// returns the number of replacements made.
//
// The search resumes right after the inserted text, so a replacement that
// itself contains ostring is not matched again (which used to loop forever)
// and adjacent occurrences are not skipped when nstring is empty.
// An empty ostring matches nothing and yields 0.
int Parser::replaceAll( string &line, string ostring, string nstring) {
	if (ostring.empty()) {
		return 0;
	}

	int count = 0;
	string::size_type pos = line.find(ostring);
	while (pos != string::npos) {
		line.replace(pos, ostring.length(), nstring);
		count++;
		pos = line.find(ostring, pos + nstring.length());
	}
	return count;
}






// Debugging aid: prints cLine and leftover to cout, then blocks on
// cin.get() until a character is available.
void Parser::testData() {
	cout << "cLine:\n\n";
	cout << cLine;
	cout << "\n\nLeftover:\n\n";
	cout << leftover;
	cout << "\nPause\n\n";
	cin.get();
}
