Log   |   Assignments   |   Source   |   Discussion   |   Feedback   |   About Me  |

This page lists the third version (v0.3) of the HTML Entity Search program. Please follow this link for the hash table implementation.

/*
 * Advanced Computer Architecture
 * Assignment - 2
 * Read an HTML file and list out all the entities present in the file
 *
 * Author: Kurian John (CS10M035)
 *
 * Revision History
 * ----------------
 * 2011-02-15 - v0.3
 * 	Rewrite - Not using regex - simple character search
 * 	Writes all valid entities to <inFileName>.valid and
 * 	invalid entities to <inFileName>.invalid
 * 2011-02-09 - v0.2
 * 	Reports only valid entities by searching in
 * 	a hash table 
 * 2011-02-08 - v0.1
 * 	Modified regular expression to match a \n and/or \r
 * 	after the & and before other characters
 * 2011-02-01 - v0.0
 * 	New program
 */

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pcre.h>
#include "hashTable.h"

/* Loads the entity names listed in entities.in into the hash table. */
void populateHashTable ();
/*
 * Copies inString to outString with spaces, '\n' and '\r' removed;
 * returns the number of characters copied.
 */
int trimSpacesAndNewline (char* inString, char* outString);

/*
 * Interactive driver.
 *
 * Repeatedly asks for an HTML file name, scans the file character by
 * character for entities of the form &name; or &#digits;, and writes every
 * entity found to <file>.valid or <file>.invalid together with the line
 * number on which it ended. Named entities are looked up in the hash table;
 * numeric entities are valid when their code is in 0..65535.
 *
 * Returns 0 on normal exit (user entered 'x', entered 0 at the retry
 * prompt, or standard input ended) and 1 when the output files cannot
 * be created.
 */
int main(int argc, char **argv)
{
	enum
	{
		ENTITY_BUF_LEN = 20,	/* room for the longest accepted entity plus NUL */
		FNAME_BUF_LEN = 200,	/* file name plus ".invalid" suffix plus NUL */
		MAX_ENTITY_CHARS = 10	/* characters allowed before the closing ; */
	};
	char readString[ENTITY_BUF_LEN] = "";
	char searchVal[ENTITY_BUF_LEN] = "";
	char fName[FNAME_BUF_LEN];
	char invalidFName[FNAME_BUF_LEN];
	char validFName[FNAME_BUF_LEN];
	char reply[16];
	char readChar = 'a';
	FILE *inFile, *invalidFile, *validFile;

	(void) argc;
	(void) argv;

	populateHashTable ();

	while (readChar != 'x')
	{
		/* Per-file scanner state; line numbering starts at 1 for every file. */
		int lineCount = 1, inEntity = 0, inNumEntity = 0,
		    entityCharCount = 0, entityCount = 0, invalidCount = 0;
		int currChar;	/* int, so that EOF is distinguishable from data */

		readString[0] = '\0';

		printf ("HTML Entity Locator - This program lists out all entities in an HTML document\n");
		printf ("Please enter the input file name: ");
		/* Width 190 leaves room for the ".invalid" suffix in the 200-byte buffers. */
		if (scanf ("%190s", fName) != 1)
			return 0;

		inFile = fopen (fName, "r");

		while (!inFile)
		{
			printf ("Error opening input file %s!\n", fName);
			printf ("Please enter a valid file name (0 to exit): ");
			if (scanf ("%190s", fName) != 1)
				return 0;
			if (!strcmp (fName, "0")) return 0;
			inFile = fopen (fName, "r");
		}
		snprintf (invalidFName, sizeof invalidFName, "%s.invalid", fName);
		snprintf (validFName, sizeof validFName, "%s.valid", fName);

		validFile = fopen (validFName, "w");
		invalidFile = fopen (invalidFName, "w");
		if (!validFile || !invalidFile)
		{
			printf ("Error opening output files %s and %s!\n", validFName, invalidFName);
			if (validFile) fclose (validFile);
			if (invalidFile) fclose (invalidFile);
			fclose (inFile);
			return 1;
		}

		while ((currChar = fgetc (inFile)) != EOF)
		{
			if (currChar == '\n')
				lineCount++;

			if (currChar == '&')
			{
				if (entityCharCount > 0)
				{
					// If we reach here, it means we saw the & of next entity before
					// closing the earlier one with a ;
					fprintf (invalidFile, "L%d: %s (Next entity found before ; of current) \n", lineCount, readString);
					entityCount++;
					invalidCount++;
					// The new entity starts from scratch, so it is not numeric yet
					inNumEntity = 0;
				}
				readString[0] = (char) currChar;
				readString[1] = '\0';
				entityCharCount = 1;
				inEntity = 1;
			}
			else if ( inEntity && (currChar == ';') )
			{
				entityCount++;
				readString[entityCharCount] = (char) currChar;
				readString[entityCharCount+1] = '\0';
				trimSpacesAndNewline (readString, searchVal);
				if (inNumEntity)
				{
					// searchVal is "&#<digits>;" - parse the digits after "&#"
					char *digitsEnd;
					long entityUnicode = strtol (searchVal + 2, &digitsEnd, 10);

					if (digitsEnd == searchVal + 2)
					{
						// "&#;" carries no code at all
						fprintf (invalidFile, "L%d: %s (Numeric entity has no digits) \n", lineCount, searchVal);
						invalidCount++;
					}
					else if ( (entityUnicode >= 0) && (entityUnicode <= 65535) )
					{
						fprintf (validFile, "L%d: %s\n", lineCount, searchVal);
					}
					else
					{
						fprintf (invalidFile, "L%d: %s (Code out of Unicode bound) \n", lineCount, searchVal);
						invalidCount++;
					}
				}
				else
				{
					// If ISO-8859-1 entity, search in the hash table to see if it's valid
					if (searchElement (searchVal) != -1)
					{
						fprintf (validFile, "L%d: %s\n", lineCount, searchVal);
					}
					else
					{
						fprintf (invalidFile, "L%d: %s (Not an accepted entity as per ISO-8859-1)\n", lineCount, searchVal);
						invalidCount++;
					}
				}
				readString[0] = '\0';
				entityCharCount = 0;
				inEntity = 0;
				inNumEntity = 0;
			}
			else if ( inEntity && (currChar == '#') )
			{
				if ( (!inNumEntity) && (entityCharCount == 1) )
				{
					// A single # is allowed, directly after the &
					readString[entityCharCount] = (char) currChar;
					readString[entityCharCount+1] = '\0';
					entityCharCount++;
					inNumEntity = 1;
				}
				else
				{
					fprintf (invalidFile, "L%d: %s (Unexpected # found) \n", lineCount, readString);
					entityCount++;
					invalidCount++;
					inNumEntity = 0;
					inEntity = 0;
					entityCharCount = 0;
					readString[0] = '\0';
				}
			}
			else if ( inEntity )
			{
				int isBlank = (currChar == 0x0D) || (currChar == 0x0A) || (currChar == 0x20);

				if ( inNumEntity && isdigit (currChar) )
				{
					// Numeric entity body: digits only. Keep the buffer
					// terminated so diagnostics never print stale characters.
					readString[entityCharCount] = (char) currChar;
					readString[entityCharCount+1] = '\0';
					entityCharCount++;
				}
				else if ( !inNumEntity && (
							( (currChar >= 0x41) && (currChar <= 0x7A) ) ||
							isdigit (currChar) ||
							isBlank ) )
				{
					// Named entity body: 0x41..0x7A ('A'..'z', which also
					// covers the punctuation between 'Z' and 'a') or digits.
					// Spaces and line breaks are accepted but not stored.
					if (!isBlank)
					{
						readString[entityCharCount] = (char) currChar;
						readString[entityCharCount+1] = '\0';
						entityCharCount++;
					}
				}
				else
				{
					// If nothing matched, then there was something wrong
					fprintf (invalidFile, "L%d: %s%c (Special character found) \n", lineCount, readString, currChar);
					entityCount++;
					invalidCount++;
					inEntity = 0;
					inNumEntity = 0;
					entityCharCount = 0;
					readString[0] = '\0';
				}
			}

			// Entities can't be more than 10 characters in length
			if (entityCharCount > MAX_ENTITY_CHARS)
			{
				fprintf (invalidFile, "L%d: %s (Allowed length exceeded) \n", lineCount, readString);
				entityCount++;
				invalidCount++;
				inEntity = 0;
				inNumEntity = 0;
				entityCharCount = 0;
				readString[0] = '\0';
			}
		}

		// An entity still open when the file ends can never be closed
		if (inEntity)
		{
			fprintf (invalidFile, "L%d: %s (End of file reached before ;) \n", lineCount, readString);
			entityCount++;
			invalidCount++;
		}

		printf ("Total number of entities found: %d\n", entityCount);
		printf ("Valid: %d\n", entityCount-invalidCount);
		printf ("Invalid: %d\n", invalidCount);
		printf ("Please see files %s.valid and %s.invalid for the list of entities.\n", fName, fName);

		printf ("Enter 'x' to exit or 'p' to process another file...");

		fclose (inFile);
		fclose (validFile);
		fclose (invalidFile);

		// Only the first character of the answer matters; stop if stdin ended
		if (scanf ("%15s", reply) != 1)
			break;
		readChar = reply[0];
		printf ("\n");
	}

	return 0;
}

/*
 * Fill the hash table with the entities read from entities.in.
 *
 * The first line of the file holds the number of entities; every following
 * whitespace-separated token is inserted as one entity. The default table
 * size is 10 slots. If the number of entities is more than 100, the table
 * size is #entities/10. If the count cannot be read, the default size is
 * used.
 *
 * Prints a message and terminates the program with exit(-1) when
 * entities.in cannot be opened.
 */
void populateHashTable ()
{
	char readLine[20];
	int numEntities = 0;
	FILE *entFile = fopen ("entities.in", "r");

	if (entFile == NULL)
	{
		printf ("Error opening entities reference file!\n");
		exit (-1);
	}

	if (fscanf (entFile, "%d", &numEntities) != 1)
		numEntities = 0;
	/* Discard the remainder of the count line. */
	fgets (readLine, sizeof readLine, entFile);

	if (numEntities > 100)
		initHashTable (numEntities/10);
	else
		initHashTable (10);

	/*
	 * Stop on the first failed read rather than testing feof() beforehand,
	 * otherwise the last entity would be inserted a second time. The field
	 * width keeps a long token from overrunning readLine.
	 */
	while (fscanf (entFile, "%19s", readLine) == 1)
	{
		insertElement (readLine);
	}

	fclose (entFile);
}

/*
 * Removes spaces, '\n' and '\r' characters from inString and copies the
 * result, NUL-terminated, to outString.
 *
 * outString must have room for strlen(inString) + 1 bytes. It may be the
 * same buffer as inString, because the write position never overtakes the
 * read position.
 *
 * Returns the number of characters written, not counting the terminator.
 */
int trimSpacesAndNewline (char* inString, char* outString)
{
	int charCount = 0;
	const char *src;

	/* Filter straight into the destination: no temporary buffer is needed. */
	for (src = inString; *src != '\0'; src++)
	{
		if ( (*src == ' ') || (*src == '\n') || (*src == '\r') )
			continue;
		outString[charCount++] = *src;
	}
	outString[charCount] = '\0';
	return charCount;
}
Hash Table Implementation
#include <stdio.h>
#include <string.h>
#include <malloc.h>

#ifndef HASH_TABLE_DEFINED
 #define HASH_TABLE_DEFINED
/* Number of slots in hashTable; set by initHashTable and used as the modulus in hashFunction. */
int HASH_SLOT_COUNT;

/* The slot array itself; allocated by initHashTable. */
struct hashSlot* hashTable;

/*
 * One bucket of the hash table: a doubly linked chain of the strings
 * that hash to this slot.
 */
struct hashSlot
{
	/* Number of nodes currently linked into this slot's chain */
	int chainLength;
	/* First node of the chain, NULL while the slot is empty */
	struct chainNode *chainHead;
	/* Last node of the chain, NULL while the slot is empty */
	struct chainNode *chainTail;
};

/* One stored string together with its links to the neighbouring nodes of the same slot. */
struct chainNode
{
	/* Following node in the chain */
	struct chainNode *nextLink;
	/* Preceding node in the chain */
	struct chainNode *prevLink;
	/* The string held by this node */
	char *chainVal;
};

/*
 * Allocates the global hash table with numberOfSlots empty slots and
 * records the slot count in HASH_SLOT_COUNT.
 *
 * A request for fewer than one slot is raised to one slot so that the
 * modulus used for hashing is never zero. Prints a message and terminates
 * the program with exit(-1) when the table cannot be allocated.
 */
void initHashTable (int numberOfSlots )
{
	int i;

	if (numberOfSlots < 1)
		numberOfSlots = 1;

	hashTable = malloc (sizeof *hashTable * (size_t) numberOfSlots);
	if (hashTable == NULL)
	{
		printf ("Error allocating hash table!\n");
		exit (-1);
	}
	HASH_SLOT_COUNT = numberOfSlots;

	for (i=0; i<numberOfSlots; i++)
	{
		hashTable[i].chainLength = 0;
		hashTable[i].chainHead = NULL;
		hashTable[i].chainTail = NULL;
	}
}

/*
 * Maps inString to a slot index: the sum of its character codes modulo
 * HASH_SLOT_COUNT.
 *
 * Returns the index in 0..HASH_SLOT_COUNT-1, or -1 when the string is
 * longer than 9 characters or the table has no slots.
 */
int hashFunction (char* inString)
{
	size_t len = strlen (inString);
	size_t i;
	unsigned int sum = 0;

	if (len > 9 || HASH_SLOT_COUNT <= 0)
		return -1;

	/*
	 * Sum the bytes as unsigned values: with a signed plain char, bytes
	 * above 0x7F would make the sum (and so the index) negative.
	 */
	for (i = 0; i < len; i++)
		sum += (unsigned char) inString[i];

	return (int) (sum % (unsigned int) HASH_SLOT_COUNT);
}

int insertElement (char* inString)
{
	struct chainNode *newLink  = malloc(sizeof(struct chainNode));
	struct chainNode *tempLink;
	struct chainNode *prevLink = NULL;
	int slotID = hashFunction (inString);
	if (slotID < 0) 
	{
		printf ("String too long!\n");
		return -1;
	}
	tempLink = hashTable[slotID].chainHead;
	while (tempLink != NULL)
	{
		prevLink = tempLink;
		tempLink = tempLink->nextLink;
	}
	if (prevLink == NULL)
		// Chain was empty
	{
		hashTable[slotID].chainHead = newLink;
		hashTable[slotID].chainTail = newLink;
	}
	else
	{
		prevLink->nextLink = newLink;
		newLink->nextLink = NULL;
		newLink->prevLink = prevLink;
		hashTable[slotID].chainTail = newLink;
	}
	newLink->chainVal = (char*) malloc (sizeof(char) * 10);
	strcpy (newLink->chainVal, inString);
	newLink->chainVal[strlen(inString)] = '\0';
	hashTable[slotID].chainLength++;
	return 0;
}

/*
 * Looks searchString up in the hash table.
 *
 * Returns the zero-based position of the first matching node within its
 * slot's chain, or -1 when hashFunction rejects the string or no node of
 * the chain matches.
 */
int searchElement (char* searchString)
{
	struct chainNode *node;
	int position;
	int slot = hashFunction (searchString);

	if (slot < 0)
		return -1;

	position = 0;
	for (node = hashTable[slot].chainHead; node != NULL; node = node->nextLink)
	{
		if (!strcmp (node->chainVal, searchString))
			return position;
		position++;
	}
	return -1;
}

/*
 * Dumps the hash table to standard output: for every slot its index and
 * recorded chain length, followed by the strings in its chain, and at the
 * end the number of nodes visited over all slots.
 */
void printTable ()
{
	int slot;
	int total = 0;

	for (slot = 0; slot < HASH_SLOT_COUNT; ++slot)
	{
		struct chainNode *node;

		printf ("Slot: %d #Elements: %d\n", slot, hashTable[slot].chainLength);
		printf ("\t");
		for (node = hashTable[slot].chainHead; node != NULL; node = node->nextLink)
		{
			printf (" %s ", node->chainVal);
			total++;
		}
		printf ("\n");
	}
	printf ("Total number of elements: %d", total);
}

/*
int main (void)
{
	char tempString[10];
	// This creates a table with 10 slots
	initHashTable (10);
	strcpy (tempString, "a\0");
	insertElement (tempString);
	strcpy (tempString, "b\0");
	insertElement (tempString);
	strcpy (tempString, "c\0");
	insertElement (tempString);
	strcpy (tempString, "d\0");
	insertElement (tempString);
	strcpy (tempString, "e\0");
	insertElement (tempString);
	strcpy (tempString, "f\0");
	insertElement (tempString);
	strcpy (tempString, "g\0");
	insertElement (tempString);
	strcpy (tempString, "h\0");
	insertElement (tempString);
	strcpy (tempString, "i\0");
	insertElement (tempString);
	strcpy (tempString, "j\0");
	insertElement (tempString);
	strcpy (tempString, "k\0");
	insertElement (tempString);
	strcpy (tempString, "\0\0\0\0\0\0\0\0\0\0\0\0\0"); 
	int i, j;
	
	printf ("Filling...\n");
	//for (i=0; i<1000; i++)
	//{
		for (j=65; j<75; j++)
		{
			tempString[0] = j;
			insertElement (tempString);
		}
	//}

	insertElement ("a\0");
	insertElement ("b\0");
	insertElement ("c\0");
	insertElement ("d\0");
	insertElement ("e\0");
//	printTable ();	
	printf ("Searching for 'a'. Result %d\n", searchElement ("a\0"));
	printf ("Searching for 'b'. Result %d\n", searchElement ("b\0"));
	printf ("Searching for 'c'. Result %d\n", searchElement ("c\0"));
	printf ("Searching for 'd'. Result %d\n", searchElement ("d\0"));
	printf ("Searching for 'e'. Result %d\n", searchElement ("e\0"));
	printf ("Searching for 'f'. Result %d\n", searchElement ("f\0"));
	return 0;
}*/
#endif