
Mailing List Archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [tlug] Re: tlug] Security question with grep/e...
- Date: Wed, 24 Mar 2004 03:35:43 -0800 (PST)
- From: Jake Morrison <jake_morrison@example.com>
- Subject: Re: [tlug] Re: tlug] Security question with grep/e...
Jim,
Just for fun and reference purposes....
Here is a C program I wrote years ago that uses the
PCRE (Perl Compatible Regular Expressions) package from
http://www.pcre.org/ to strip HTML tags out of a file.
Obviously, this is much more painful than the equivalent in Python
or Perl. But it does work. And it's portable (once you
have installed the PCRE libs...). Maybe you could modify it to do
what you want. There is now a C++ wrapper which makes the
library easier to use, too.
Regards,
Jake
#include <assert.h>
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pcre.h>
/* Number of ints in the output vector handed to pcre_exec(); the code
   treats VECSIZE / 3 as the largest number of match pairs it can hold. */
#define VECSIZE 60
/* Size in bytes of the line buffers (input line, stripped output line and
   the debug scratch buffer). */
#define BUFLEN 10000
/* Passed to strip_html() as its debug argument; when non-zero, regex
   execution errors are described on stderr. */
int debugMode = 0;
/*
 * Write a one-line description of a PCRE error code to stderr.
 *
 * pairs - the error code to describe (strip_html() passes the negative
 *         value returned by pcre_exec()).  Codes without a dedicated
 *         message are printed numerically.
 */
void
print_re_error(int pairs)
{
    const char *text = NULL;

    if (pairs == PCRE_ERROR_NOMATCH) {
        text = "No match.\n";
    } else if (pairs == PCRE_ERROR_NULL) {
        text = "One or more NULL input params.\n";
    } else if (pairs == PCRE_ERROR_BADOPTION) {
        text = "Invalid option specified.\n";
    } else if (pairs == PCRE_ERROR_BADMAGIC) {
        text = "Bad magic.\n";
    } else if (pairs == PCRE_ERROR_UNKNOWN_NODE) {
        text = "Unknown node.\n";
    } else if (pairs == PCRE_ERROR_NOMEMORY) {
        text = "Out of memory.\n";
    }

    if (text == NULL) {
        /* No fixed message for this code: show the raw number instead */
        fprintf(stderr, "Unknown error %d.\n", pairs);
        return;
    }
    fputs(text, stderr);
}
/*
 * Compile a regular expression with pcre_compile().
 *
 * re_str  - pattern source text.
 * options - option bits passed straight through to pcre_compile().
 *
 * Returns the compiled pattern, or NULL after printing the compile error
 * and its offset in the pattern to stderr.
 */
pcre*
re_compile(const char* re_str, int options)
{
    const char *err_text = NULL;
    int err_pos = 0;
    pcre *compiled;

#ifdef RE_DEBUG
    printf("re_compile> re_str: %s\n", re_str);
#endif
    compiled = pcre_compile(re_str, options, &err_text, &err_pos, NULL);
    if (compiled != NULL) {
#ifdef RE_DEBUG
        printf("re_compile> pcre_compile succeeded\n");
#endif
        return compiled;
    }

    fprintf(stderr, "pcre_compile: error at offset %d: %s\n", err_pos, err_text);
    return NULL;
}
/* Forward declaration: main() calls strip_html() before its definition
   appears in this file. */
int
strip_html(char* dest, const char* src, size_t src_len,
pcre *re, pcre_extra *extra, int debug);
/*
 * Filter stdin to stdout one line at a time, removing HTML tags.
 *
 * Exit status is 3 when the tag pattern fails to compile and 0 otherwise.
 * A line that strip_html() rejects is reported on stderr and not echoed.
 */
int
main (int argc, char *argv[])
{
    const char* tag_pattern = "<(?:[^>'\"]*|(['\"]).*?\\1)*>";
    const char* study_msg = NULL;
    pcre_extra *study_data = NULL;
    pcre *tag_re = NULL;
    char line_in[BUFLEN];
    char line_out[BUFLEN];
    int status;

    tag_re = re_compile(tag_pattern, 0);
    if (!tag_re) {
        return 3;
    }

    /* A study failure is only reported; matching continues without it */
    study_data = pcre_study(tag_re, 0, &study_msg);
    if (study_msg) {
        fprintf(stderr, "pcre_study: %s\n", study_msg);
    }

    while (fgets(line_in, BUFLEN, stdin) != NULL) {
        status = strip_html(line_out, line_in, strlen(line_in),
                            tag_re, study_data, debugMode);
        if (status == 0) {
            fputs(line_out, stdout);
        } else {
            fprintf(stderr, "Problem parsing string %s\n", line_in);
        }
    }

    pcre_free(study_data);
    pcre_free(tag_re);
    return 0;
}
/*
 * Strip HTML tags from src, copying everything else to the dest buffer.
 *
 * dest    - output buffer; receives a NUL-terminated string.  Nothing is
 *           ever added to the text, so src_len + 1 bytes is always enough.
 * src     - text to scan.  NULL or empty input produces an empty dest.
 * src_len - number of bytes of src to scan.
 * re      - compiled pattern; every match of it is dropped from the output.
 * extra   - study data for re as returned by pcre_study(); handed to
 *           pcre_exec() unchanged.
 * debug   - when non-zero, pcre_exec() failures are described on stderr.
 *
 * Returns 0 on success, or 1 if pcre_exec() fails with anything other
 * than PCRE_ERROR_NOMATCH (dest then holds the text copied so far).
 */
int
strip_html(char* dest, const char* src, size_t src_len,
           pcre *re, pcre_extra *extra, int debug)
{
    int ovector[VECSIZE];   /* List of matches */
    int pairs = 0;          /* Number of matches */
    int start_offset = 0;   /* Point in src string to start searching */
    int flags = 0;          /* Regex matching flags */
    char* dest_cur = dest;  /* Pointer to current location in dest buffer */
    size_t chunk;           /* Length of the piece being copied */
#ifdef DEBUG
    int i;                  /* Utility counter */
    char buf[BUFLEN];       /* buffer used for debugging */
#endif

    assert(re);

    /* The original test was `!src && src[0]`, which dereferenced a NULL
       src and could never be true otherwise.  Also leave dest as a valid
       empty string so the caller can print it safely. */
    if (!src || src_len == 0 || !src[0]) {
#ifdef DEBUG
        fprintf(stderr, "strip_html> Null input string\n");
#endif
        if (dest) {
            dest[0] = '\0';
        }
        return (0);
    }
#ifdef DEBUG
    /* Only trace src once it is known to be non-NULL */
    fprintf(stderr, "strip_html> parsing string %s\n", src);
#endif

    while (1) {
        pairs = pcre_exec(re, extra, src, (int)src_len, start_offset,
                          flags, ovector, VECSIZE);
        if (pairs < 0) {
            if (pairs == PCRE_ERROR_NOMATCH) {
#ifdef DEBUG
                fprintf(stderr,
                        "strip_html> No match for string '%s'.\n",
                        src + start_offset);
#endif
                /* No HTML found in remainder of string, just copy input
                   to output.  Bounded by src_len so only the bytes that
                   were actually scanned are copied. */
                chunk = src_len - (size_t)start_offset;
                memcpy(dest_cur, src + start_offset, chunk);
                dest_cur[chunk] = '\0';
                return (0);
            }
            if (debug) {
                fprintf(stderr,
                        "strip_html: Problem running regex for string %s: ",
                        src);
                print_re_error(pairs);
            }
            /* Keep dest a valid string even on failure */
            *dest_cur = '\0';
            return (1);
        }
        else if (pairs == 0) {
            /* This should not happen */
#ifdef DEBUG
            fprintf(stderr,
                    "strip_html> More than %d matches for string %s\n",
                    VECSIZE / 3, src);
#endif
            pairs = VECSIZE / 3;
        }
        else {
            /* This should always be 1 for this regex */
#ifdef DEBUG
            fprintf(stderr, "strip_html> Matched %d pair(s)\n", pairs);
#endif
        }
#ifdef DEBUG
        fprintf(stderr, "strip_html> ovector[0]: %d\n", ovector[0]);
        fprintf(stderr, "strip_html> ovector[1]: %d\n", ovector[1]);
        for (i = 0; i < pairs; i++) {
            int len;
            fprintf(stderr, "strip_html> pair: %d\n", i);
            len = pcre_copy_substring(src, ovector, pairs, i,
                                      buf, sizeof(buf));
            if (len < 0) {
                fprintf(stderr,
                        "strip_html> Problem getting substring %d: %d\n", i, len);
            }
            else {
                fprintf(stderr,
                        "strip_html> Matched string %d: %s\n", i, buf);
            }
        }
#endif
        /* Copy part before the match */
        chunk = (size_t)(ovector[0] - start_offset);
        memcpy(dest_cur, src + start_offset, chunk);
        /* Update current location in destination buffer */
        dest_cur += chunk;
#ifdef DEBUG
        /* Terminate before tracing; the next copy overwrites this byte */
        *dest_cur = '\0';
        fprintf(stderr, "dest: %s\n", dest);
#endif
        if (ovector[1] > ovector[0]) {
            /* Update offset to point after data */
            start_offset = ovector[1];
        }
        else {
            /* Empty match: searching again from the same offset would
               find it again forever.  Nothing was removed, so pass one
               byte through and resume after it. */
            if ((size_t)ovector[1] >= src_len) {
                *dest_cur = '\0';
                return (0);
            }
            *dest_cur++ = src[ovector[1]];
            start_offset = ovector[1] + 1;
        }
    }
}
Home |
Main Index |
Thread Index