Log   |   Assignments   |   Source   |   Discussion   |   Feedback   |   About Me  |

This page lists the second version (v0.2) of the HTML Entity Search program.


Hash Table Implementation
#include <stdio.h>
#include <string.h>
#include <malloc.h>

#ifndef HASH_TABLE_DEFINED
 #define HASH_TABLE_DEFINED
int HASH_SLOT_COUNT;

struct hashSlot* hashTable;

struct hashSlot 
{
	int chainLength;
	struct chainNode *chainHead, *chainTail;
};

struct chainNode
{
	struct chainNode *nextLink, *prevLink;
	char* chainVal;
};

void initHashTable (int numberOfSlots )
{
	HASH_SLOT_COUNT = numberOfSlots;
	hashTable = (struct hashSlot*) malloc (sizeof(struct hashSlot) * numberOfSlots);
	int i;
	for (i=0; i<numberOfSlots; i++)
	{
		hashTable[i].chainLength = 0;
		hashTable[i].chainHead = NULL;
		hashTable[i].chainTail = NULL;
	}
}

int hashFunction (char* inString)
{
	int i=0, retVal=0;
	if (strlen(inString) > 9)
	{
		retVal = -1;
	}
	else
	{
		for (i=0; i<strlen(inString); i++)
		{
			retVal += inString[i];
		}
		retVal %= HASH_SLOT_COUNT;
	}
	return retVal;
}

int insertElement (char* inString)
{
	struct chainNode *newLink  = malloc(sizeof(struct chainNode));
	struct chainNode *tempLink;
	struct chainNode *prevLink = NULL;
	int slotID = hashFunction (inString);
	if (slotID < 0) 
	{
		printf ("String too long!\n");
		return -1;
	}
	tempLink = hashTable[slotID].chainHead;
	while (tempLink != NULL)
	{
		prevLink = tempLink;
		tempLink = tempLink->nextLink;
	}
	if (prevLink == NULL)
		// Chain was empty
	{
		hashTable[slotID].chainHead = newLink;
		hashTable[slotID].chainTail = newLink;
	}
	else
	{
		prevLink->nextLink = newLink;
		newLink->nextLink = NULL;
		newLink->prevLink = prevLink;
		hashTable[slotID].chainTail = newLink;
	}
	newLink->chainVal = (char*) malloc (sizeof(char) * 10);
	strcpy (newLink->chainVal, inString);
	newLink->chainVal[strlen(inString)] = '\0';
	hashTable[slotID].chainLength++;
	return 0;
}

int searchElement (char* searchString)
{
	int matchPos=0;
	int slotID = hashFunction (searchString);
	if (slotID < 0)
		return -1;
	struct chainNode* tempLink;
	tempLink = hashTable[slotID].chainHead;

	while (tempLink != NULL)
	{
		if (strcmp (tempLink->chainVal, searchString) == 0 )
		{
			return matchPos;
			break;
		}
		matchPos++;
		tempLink = tempLink->nextLink;
	}
	return -1;
}

void printTable ()
{
	int i, j=0;
	struct chainNode *tempNode;
	for (i=0; i<HASH_SLOT_COUNT; i++)
	{
		printf ("Slot: %d #Elements: %d\n", i, hashTable[i].chainLength);
		tempNode =  hashTable[i].chainHead;
		printf ("\t");
		while (tempNode != NULL)
		{
			printf (" %s ", tempNode->chainVal);
			tempNode = tempNode->nextLink;
			j++;
		}
		printf ("\n");
	}
	printf ("Total number of elements: %d", j);
}

/*
int main (void)
{
	char tempString[10];
	// This creates a table with 100 slots
	initHashTable (10);
	strcpy (tempString, "a\0");
	insertElement (tempString);
	strcpy (tempString, "b\0");
	insertElement (tempString);
	strcpy (tempString, "c\0");
	insertElement (tempString);
	strcpy (tempString, "d\0");
	insertElement (tempString);
	strcpy (tempString, "e\0");
	insertElement (tempString);
	strcpy (tempString, "f\0");
	insertElement (tempString);
	strcpy (tempString, "g\0");
	insertElement (tempString);
	strcpy (tempString, "h\0");
	insertElement (tempString);
	strcpy (tempString, "i\0");
	insertElement (tempString);
	strcpy (tempString, "j\0");
	insertElement (tempString);
	strcpy (tempString, "k\0");
	insertElement (tempString);
	strcpy (tempString, "\0\0\0\0\0\0\0\0\0\0\0\0\0"); 
	int i, j;
	
	printf ("Filling...\n");
	//for (i=0; i<1000; i++)
	//{
		for (j=65; j<75; j++)
		{
			tempString[0] = j;
			insertElement (tempString);
		}
	//}

	insertElement ("a\0");
	insertElement ("b\0");
	insertElement ("c\0");
	insertElement ("d\0");
	insertElement ("e\0");
//	printTable ();	
	printf ("Searching for 'a'. Result %d\n", searchElement ("a\0"));
	printf ("Searching for 'b'. Result %d\n", searchElement ("b\0"));
	printf ("Searching for 'c'. Result %d\n", searchElement ("c\0"));
	printf ("Searching for 'd'. Result %d\n", searchElement ("d\0"));
	printf ("Searching for 'e'. Result %d\n", searchElement ("e\0"));
	printf ("Searching for 'f'. Result %d\n", searchElement ("f\0"));
	return 0;
}*/
#endif



/*
 * Advanced Computer Architecture
 * Assignment - 2
 * Read an HTML file and list out all the entities present in the file
 *
 * Author: Kurian John (CS10M035)
 *
 * Revision History
 * ----------------
 * 2011-02-09 - v0.2
 * 	Reports only valid entities by searching in
 * 	a hash table 
 * 2011-02-08 - v0.1
 * 	Modified regular expression to match a \n and/or \r
 * 	after the & and before other characters
 * 2011-02-01 - v0.0
 * 	New program
 */

#include <stdio.h>
#include <string.h>
#include <pcre.h>
#include "hashTable.c"

#define OVECCOUNT 120 

void populateHashTable ();
int  printMatches(char* subject, char *pattern, int lineCount);
int trimSpacesAndNewline (char* inString, char* outString);

int main(int argc, char **argv)
{
	char *pattern = (char *) malloc (sizeof(char) * 100);
	char *readString = (char *) malloc (sizeof(char) * 250);
	char *fName = (char *) malloc (sizeof(char) * 200);
	FILE* inFile;

	int fileIsDone = 0, i=0, charCount=0, lineCount=1;
	int matchCount=0, matchCountTemp=0;
	char currChar, readChar='a', tChar[2];

	populateHashTable ();
	//printTable ();

	while (readChar != 'x')
	{
		printf ("HTML Entity Locator - This program lists out all entities in an HTML document\n");
		printf ("Please enter the input file name: ");
		scanf ("%s", fName);

		inFile = fopen (fName, "r");

		while (!inFile)
		{
			printf ("Error opening input file %s!\n", fName);
			printf ("Please enter a valid file name (0 to exit): ");
			scanf ("%s", fName);
			if (!strcmp (fName, "0")) return 0;
			inFile = fopen (fName, "r");
		}

		// The regular expression that matches all entities
		strcpy (pattern, "&([\n\r]*)([a-z|A-Z]+|#[0-9]+);");
		int waitingForEntity = 0, charCount=0;
		while (!fileIsDone)
		{
			// Read the file line by line 
			// If the line is longer than 98 characters, break into pieces
			while ((charCount < 70) || (waitingForEntity))
			{
				currChar = fgetc (inFile);
				if ( (currChar == EOF) )
				{
					if (waitingForEntity)
						printf ("Possible invalid entity in file. Found & but not ;!\n");
					fileIsDone = 1;
					break;
				}
				readString[charCount] = currChar;
				if (currChar == '&')
				{
					waitingForEntity = 1;
				}

				if ( (currChar == ';') && waitingForEntity)
				{
					waitingForEntity = 0;
				}
				charCount++;
				if (charCount > 150)
				{
					printf ("Possible invalid entity in file. Found & but not ;!\n");
					break;
				}
			}

			readString[charCount] = '\0';
			matchCountTemp = printMatches (readString, pattern, lineCount);	
			if (matchCountTemp == -1)
			{
				printf ("Error encountered during regex search!\n");
				break;
			}
			matchCount += matchCountTemp;
			charCount=0;
			readString[0]='\0';
			waitingForEntity = 0;
		}
		printf ("Total number of entities found: %d\n", matchCount);

		printf ("Enter 'x' to exit or 'p' to process another file...");

		fileIsDone = 0;
		matchCount = 0;

		scanf ("%s", tChar);
		readChar = tChar[0];
		printf ("\n");
	}

	return 0;
}

void populateHashTable ()
{
	FILE* entFile = NULL;
	entFile = fopen ("entities.in", "r");
	if (entFile == NULL)
	{
		printf ("Error opening entities reference file!\n");
		exit (-1);
	}
	char readLine[20];
	int numEntities;
	fscanf (entFile, "%d", &numEntities);
	fgets (readLine, 20, entFile);
	if (numEntities > 100)
		initHashTable (numEntities/10);
	else
		initHashTable (10);

	while (!feof (entFile))
	{
		fscanf (entFile, "%s", readLine);
		insertElement (readLine);	
	}

	return;
}

int  printMatches(char* subject, char *pattern, int lineCount)
{
	pcre *re;
	const char *error;
	unsigned char *name_table;
	int erroffset;
	int find_all;
	int namecount;
	int name_entry_size;
	int ovector[OVECCOUNT];
	int subject_length;
	int rc, i;
	int matchCount=0;

	subject_length = (int)strlen(subject);

	// Compile the regex
	re = pcre_compile(
			pattern,              /* the pattern */
			0,                    /* default options */
			&error,               /* for error message */
			&erroffset,           /* for error offset */
			NULL);                /* use default character tables */

	/* Compilation failed: print the error message and exit */
	if (re == NULL)
	{
		printf("PCRE compilation failed at offset %d: %s\n", erroffset, error);
		return -1;
	}

	// Compile was success
	rc = pcre_exec(
			re,                   /* the compiled pattern */
			NULL,                 /* no extra data - we didn't study the pattern */
			subject,              /* the subject string */
			subject_length,       /* the length of the subject */
			0,                    /* start at offset 0 in the subject */
			0,                    /* default options */
			ovector,              /* output vector for substring information */
			OVECCOUNT);           /* number of elements in the output vector */

	if (rc < 0)
		// Matching failed
	{
		if (rc != PCRE_ERROR_NOMATCH)
		{
			printf("Matching error %d\n", rc);
			return -1;
		}
		pcre_free(re);     /* Release memory used for the compiled pattern */
		return 0;
	}

	/* Match succeded */


	/* The output vector wasn't big enough */
	if (rc == 0)
	{
		rc = OVECCOUNT/3;
		printf("ovector only has room for %d captured substrings\n", rc - 1);
	}
	else
	{
		// Print the first match
		char *substring_start = subject + ovector[0];
		int substring_length = ovector[1] - ovector[0	];
		//		printf("%.*s   ", substring_length, substring_start);
		char *foundVal = (char*) malloc(sizeof(char) * 250);
		char *searchVal = (char*) malloc(sizeof(char) * 250) ;
		strncpy (foundVal, substring_start, substring_length);
		foundVal[substring_length] = '\0';
		trimSpacesAndNewline (foundVal, searchVal);
		printf ("Found: %s ", searchVal);
		if (searchElement (searchVal) != -1)
		{
			printf (" (Valid) \n");
		}
		else
		{
			printf (" (Invalid) \n");
		}

		matchCount++;
	}

	/* Loop for second and subsequent matches */
	for (;;)
	{
		int options = 0;                 /* Normally no options */
		int start_offset = ovector[1];   /* Start at end of previous match */

		/* If the previous match was for an empty string, we are finished if we are
		 * at the end of the subject. Otherwise, arrange to run another match at the
		 * same point to see if a non-empty match can be found. */

		if (ovector[0] == ovector[1])
		{
			if (ovector[0] == subject_length) break;
			options = PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED;
		}

		/* Run the next matching operation */

		rc = pcre_exec(
				re,                   /* the compiled pattern */
				NULL,                 /* no extra data - we didn't study the pattern */
				subject,              /* the subject string */
				subject_length,       /* the length of the subject */
				start_offset,         /* starting offset in the subject */
				options,              /* options */
				ovector,              /* output vector for substring information */
				OVECCOUNT);           /* number of elements in the output vector */

		/* This time, a result of NOMATCH isn't an error. If the value in "options"
		 * is zero, it just means we have found all possible matches, so the loop ends.
		 * Otherwise, it means we have failed to find a non-empty-string match at a
		 * point where there was a previous empty-string match. In this case, we do what
		 * Perl does: advance the matching position by one, and continue. We do this by
		 * setting the "end of previous match" offset, because that is picked up at the
		 * top of the loop as the point at which to start again. */

		if (rc == PCRE_ERROR_NOMATCH)
		{
			if (options == 0) break;
			ovector[1] = start_offset + 1;
			continue;    /* Go round the loop again */
		}

		/* Other matching errors are not recoverable. */

		if (rc < 0)
		{
			printf("Matching error %d\n", rc);
			pcre_free(re);    /* Release memory used for the compiled pattern */
			return -1;
		}

		/* Match succeded */


		/* The match succeeded, but the output vector wasn't big enough. */
		if (rc == 0)
		{
			rc = OVECCOUNT/3;
			printf("ovector only has room for %d captured substrings\n", rc - 1);
		}
		else
		{
			char *substring_start = subject + ovector[0];
			int substring_length = ovector[1] - ovector[0];
			//	printf(" %.*s  ", substring_length, substring_start);
			char *foundVal = (char*) malloc(sizeof(char) * 250);

			char *searchVal = (char*) malloc(sizeof(char) * 250) ;
			strncpy (foundVal, substring_start, substring_length);
			trimSpacesAndNewline (foundVal, searchVal);
			printf ("Found: %s ", searchVal);
			if (searchElement (searchVal) != -1)
			{
				printf (" (Valid) \n");
			}
			else
			{
				printf (" (Invalid) \n");
			}

			matchCount++;
		}
	}      /* End of loop to find second and subsequent matches */

	pcre_free(re);       /* Release memory used for the compiled pattern */
	return matchCount;
}


int trimSpacesAndNewline (char* inString, char* outString)
{
	int i, charCount=0;
	char* tempString;
	if (sizeof(inString) > 0 )
	{
		tempString = (char*) malloc (sizeof(inString));
	}
	tempString[0] = '\0';
	for (i=0; i<strlen (inString); i++)
	{
		if ( (inString [i] == ' ') || (inString[i] == '\n') || (inString[i] == '\r') )
		{
			//Skip
		}
		else
		{
			strncat (tempString, inString+i, 1);
			charCount++;
		}
	}
	tempString[charCount] = '\0';
	memcpy (outString, tempString, charCount+1);
	return charCount;
}