Log   |   Assignments   |   Source   |   Discussion   |   Feedback   |   About Me  |

This page lists the second version (v0.1) of the HTML Entity Search program.

/*
 * Advanced Computer Architecture
 * Assignment - 2
 * Read an HTML file and list out all the entities present in the file
 *
 * Author: Kurian John (CS10M035)
 *
 * Revision History
 * ----------------
 * 2011-02-08 - v0.1
 * 	Modified regular expression to match a \n and/or \r
 * 	after the & and before other characters
 * 2011-02-01 - v0.0
 * 	New program
 */

#include <stdio.h>
#include <string.h>
#include <pcre.h>

/*
 * Number of int slots in the offset vector handed to pcre_exec() by
 * printMatches().  printMatches() reports the capacity as OVECCOUNT/3
 * substrings, so the value is presumably meant to stay a multiple of 3
 * (confirm against the PCRE documentation before changing it).
 */
#define OVECCOUNT 120 


/*
 * Search `subject` for every occurrence of the regular expression `pattern`
 * and print each match prefixed with "Line <lineCount>: ".
 * Returns the number of matches printed, 0 when nothing matches, or -1 when
 * the pattern fails to compile or the regex engine reports a matching error.
 */
int  printMatches(char* subject, char *pattern, int lineCount);

/*
 * Interactive driver for the HTML entity locator.
 *
 * Repeatedly asks for a file name, scans that file in chunks and passes each
 * chunk to printMatches(), then prints the total number of entities found.
 * The user enters 'x' to quit or anything else to process another file.
 * Entering "0" at the retry prompt, or reaching end of input on stdin,
 * terminates the program.
 *
 * Chunking: a chunk normally holds CHUNK_LEN characters.  If an '&' has been
 * seen without its closing ';', the chunk is extended (up to CHUNK_MAX
 * characters) so an entity is not split across two regex searches.
 *
 * The line number passed to printMatches() is the line on which the current
 * chunk starts; a chunk may span several lines.
 *
 * Returns 0 in all cases.
 */
int main(int argc, char **argv)
{
	enum {
		NAME_LEN  = 100,            /* file name buffer, including the NUL   */
		CHUNK_LEN = 100,            /* normal number of characters per chunk */
		CHUNK_MAX = 150,            /* limit while waiting for a closing ';' */
		CHUNK_BUF = CHUNK_MAX + 2   /* up to CHUNK_MAX + 1 chars plus NUL    */
	};

	// The regular expression that matches all entities
	char pattern[] = "&([\n\r]*)([a-z|A-Z]+|#[0-9]+);";
	char readString[CHUNK_BUF];
	char fName[NAME_LEN];
	char answer[16];
	char readChar = 'a';

	(void) argc;
	(void) argv;

	while (readChar != 'x')
	{
		FILE *inFile;
		int fileIsDone = 0;
		int lineCount = 1;
		int matchCount = 0;

		printf ("HTML Entity Locator - This program lists out all entities in an HTML document\n");
		printf ("Please enter the input file name: ");
		/* Bounded read: the width must stay below NAME_LEN. */
		if (scanf ("%99s", fName) != 1) return 0;

		inFile = fopen (fName, "r");

		while (!inFile)
		{
			printf ("Error opening input file %s!\n", fName);
			printf ("Please enter a valid file name (0 to exit): ");
			if (scanf ("%99s", fName) != 1) return 0;
			if (!strcmp (fName, "0")) return 0;
			inFile = fopen (fName, "r");
		}

		while (!fileIsDone)
		{
			int charCount = 0;
			int waitingForEntity = 0;
			int newlinesInChunk = 0;
			int matchCountTemp;

			// Read one chunk; keep going past CHUNK_LEN while an entity is open
			while ((charCount < CHUNK_LEN) || waitingForEntity)
			{
				/* int, not char: EOF must stay distinguishable from byte 0xFF */
				int currChar = fgetc (inFile);

				if (currChar == EOF)
				{
					if (waitingForEntity)
						printf ("Possible invalid entity in file. Found & but not ;!\n");
					fileIsDone = 1;
					break;
				}

				readString[charCount++] = (char) currChar;

				if (currChar == '\n')
				{
					newlinesInChunk++;
				}
				else if (currChar == '&')
				{
					waitingForEntity = 1;
				}
				else if (currChar == ';')
				{
					waitingForEntity = 0;
				}

				/* Give up on an unterminated entity before the buffer fills. */
				if (waitingForEntity && (charCount > CHUNK_MAX))
				{
					printf ("Possible invalid entity in file. Found & but not ;!\n");
					break;
				}
			}

			readString[charCount] = '\0';
			matchCountTemp = printMatches (readString, pattern, lineCount);
			if (matchCountTemp == -1)
			{
				printf ("Error encountered during regex search!\n");
				break;
			}
			matchCount += matchCountTemp;
			lineCount += newlinesInChunk;
		}

		fclose (inFile);

		printf ("Total number of entities found: %d\n", matchCount);

		printf ("Enter 'x' to exit or 'p' to process another file...");

		/* Bounded read; stop if stdin is exhausted instead of looping forever. */
		if (scanf ("%15s", answer) != 1) break;
		readChar = answer[0];
		printf ("\n");
	}

	return 0;
}

/*
 * Find and print every match of `pattern` in `subject`.
 *
 * subject   - NUL-terminated text to search.
 * pattern   - NUL-terminated PCRE regular expression; it is compiled on
 *             every call.
 * lineCount - number printed in the "Line N: " prefix of each match.
 *
 * Returns the number of matches printed, 0 if there are none, or -1 if
 * either argument is NULL, the pattern does not compile, or pcre_exec()
 * reports an error other than PCRE_ERROR_NOMATCH.  The compiled pattern is
 * released on every path once compilation has succeeded.
 */
int  printMatches(char* subject, char *pattern, int lineCount)
{
	pcre *re;
	const char *error;
	int erroffset;
	int ovector[OVECCOUNT];
	int subject_length;
	int rc;
	int matchCount=0;

	if (subject == NULL || pattern == NULL)
		return -1;

	subject_length = (int)strlen(subject);

	// Compile the regex
	re = pcre_compile(
			pattern,              /* the pattern */
			0,                    /* default options */
			&error,               /* for error message */
			&erroffset,           /* for error offset */
			NULL);                /* use default character tables */

	/* Compilation failed: print the error message and exit */
	if (re == NULL)
	{
		printf("PCRE compilation failed at offset %d: %s\n", erroffset, error);
		return -1;
	}

	// Compilation succeeded: look for the first match
	rc = pcre_exec(
			re,                   /* the compiled pattern */
			NULL,                 /* no extra data - we didn't study the pattern */
			subject,              /* the subject string */
			subject_length,       /* the length of the subject */
			0,                    /* start at offset 0 in the subject */
			0,                    /* default options */
			ovector,              /* output vector for substring information */
			OVECCOUNT);           /* number of elements in the output vector */

	if (rc < 0)
		// Matching failed
	{
		int result = 0;           /* NOMATCH simply means zero entities */

		if (rc != PCRE_ERROR_NOMATCH)
		{
			printf("Matching error %d\n", rc);
			result = -1;
		}
		pcre_free(re);     /* Release memory used for the compiled pattern */
		return result;
	}

	/* Match succeeded */

	printf("Line %d: ", lineCount);

	/* The output vector wasn't big enough */
	if (rc == 0)
	{
		rc = OVECCOUNT/3;
		printf("ovector only has room for %d captured substrings\n", rc - 1);
	}
	else
	{
		// Print the first match
		char *substring_start = subject + ovector[0];
		int substring_length = ovector[1] - ovector[0];
		printf("%.*s\n", substring_length, substring_start);
		matchCount++;
	}

	/* Loop for second and subsequent matches */
	for (;;)
	{
		int options = 0;                 /* Normally no options */
		int start_offset = ovector[1];   /* Start at end of previous match */

		/* If the previous match was for an empty string, we are finished if we are
		 * at the end of the subject. Otherwise, arrange to run another match at the
		 * same point to see if a non-empty match can be found. */

		if (ovector[0] == ovector[1])
		{
			if (ovector[0] == subject_length) break;
			options = PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED;
		}

		/* Run the next matching operation */

		rc = pcre_exec(
				re,                   /* the compiled pattern */
				NULL,                 /* no extra data - we didn't study the pattern */
				subject,              /* the subject string */
				subject_length,       /* the length of the subject */
				start_offset,         /* starting offset in the subject */
				options,              /* options */
				ovector,              /* output vector for substring information */
				OVECCOUNT);           /* number of elements in the output vector */

		/* This time, a result of NOMATCH isn't an error. If the value in "options"
		 * is zero, it just means we have found all possible matches, so the loop ends.
		 * Otherwise, it means we have failed to find a non-empty-string match at a
		 * point where there was a previous empty-string match. In this case, we do what
		 * Perl does: advance the matching position by one, and continue. We do this by
		 * setting the "end of previous match" offset, because that is picked up at the
		 * top of the loop as the point at which to start again. */

		if (rc == PCRE_ERROR_NOMATCH)
		{
			if (options == 0) break;
			ovector[1] = start_offset + 1;
			continue;    /* Go round the loop again */
		}

		/* Other matching errors are not recoverable. */

		if (rc < 0)
		{
			printf("Matching error %d\n", rc);
			pcre_free(re);    /* Release memory used for the compiled pattern */
			return -1;
		}

		/* Match succeeded */

		printf("Line %d: ", lineCount);

		/* The match succeeded, but the output vector wasn't big enough. */
		if (rc == 0)
		{
			rc = OVECCOUNT/3;
			printf("ovector only has room for %d captured substrings\n", rc - 1);
		}
		else
		{
			char *substring_start = subject + ovector[0];
			int substring_length = ovector[1] - ovector[0];
			printf(" %.*s\n", substring_length, substring_start);
			matchCount++;
		}
	}      /* End of loop to find second and subsequent matches */

	pcre_free(re);       /* Release memory used for the compiled pattern */
	return matchCount;
}