Mailing List Archive
tlug.jp Mailing List tlug archive tlug Mailing List Archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]Re: [tlug] Re: tlug] Security question with grep/e...
- Date: Wed, 24 Mar 2004 03:35:43 -0800 (PST)
- From: Jake Morrison <jake_morrison@example.com>
- Subject: Re: [tlug] Re: tlug] Security question with grep/e...
Jim, Just for fun and reference purposes.... Here is a C program I wrote years ago that uses the PCRE (Perl Compatible Regular Expressions) package from http://www.pcre.org/ to strip HTML tags out of a file. Obviously, this is much more painful than the equivalent in Python or Perl. But it does work. And it's portable (once you have installed the PCRE libs...). Maybe you could modify it to do what you want. There is now a C++ wrapper which makes the library easier to use, too. Regards, Jake#include <stdio.h> #include <malloc.h> #include <assert.h> #include <pcre.h> #define VECSIZE 60 #define BUFLEN 10000 int debugMode = 0; void print_re_error(int pairs) { switch (pairs) { case PCRE_ERROR_NOMATCH: fprintf(stderr, "No match.\n"); break; case PCRE_ERROR_NULL: fprintf(stderr, "One or more NULL input params.\n"); break; case PCRE_ERROR_BADOPTION: fprintf(stderr, "Invalid option specified.\n"); break; case PCRE_ERROR_BADMAGIC: fprintf(stderr, "Bad magic.\n"); break; case PCRE_ERROR_UNKNOWN_NODE: fprintf(stderr, "Unknown node.\n"); break; case PCRE_ERROR_NOMEMORY: fprintf(stderr, "Out of memory.\n"); break; default: fprintf(stderr, "Unknown error %d.\n", pairs); } } pcre* re_compile(const char* re_str, int options) { pcre* re = NULL; const char *errptr = NULL; int erroffset = 0; #ifdef RE_DEBUG printf("re_compile> re_str: %s\n", re_str); #endif re = pcre_compile(re_str, options, &errptr, &erroffset, NULL); if (re == NULL) { fprintf(stderr, "pcre_compile: error at offset %d: %s\n", erroffset, errptr); return ((pcre*)NULL); } #ifdef RE_DEBUG printf("re_compile> pcre_compile succeeded\n"); #endif return (re); } int strip_html(char* dest, const char* src, size_t src_len, pcre *re, pcre_extra *extra, int debug); int main (int argc, char *argv[]) { pcre_extra *extra = NULL; const char* study_error = NULL; pcre *html_re = NULL; const char* html_re_str = "<(?:[^>'\"]*|(['\"]).*?\\1)*>"; int rc; char source[BUFLEN]; char dest[BUFLEN]; html_re = re_compile(html_re_str, 0); if (html_re == NULL) { exit (3); } extra = pcre_study(html_re, 0, &study_error); if (study_error != NULL) { fprintf(stderr, "pcre_study: %s\n", study_error); } while ( (fgets(source, BUFLEN, stdin)) != NULL) { rc = strip_html(dest, source, strlen(source), html_re, extra, debugMode); if (rc != 0) { fprintf(stderr, "Problem parsing string %s\n", source); continue; } printf("%s", dest); } pcre_free(extra); pcre_free(html_re); exit (0); } /* Strip HTML tags from source, copying to dest buffer */ int strip_html(char* dest, const char* src, size_t src_len, pcre *re, pcre_extra *extra, int debug) { int ovector[VECSIZE]; /* List of matches */ int pairs = 0; /* Number of matches */ int start_offset = 0; /* Point in src string to start searching */ int flags = 0; /* Regex matching flags */ char* dest_cur = dest; /* Pointer to current location in dest buffer */ #ifdef DEBUG int i; /* Utility counter */ char buf[BUFLEN]; /* buffer used for debugging */ #endif assert(re); #ifdef DEBUG fprintf(stderr, "strip_html> parsing string %s\n", src); #endif if (!src && src[0]) { #ifdef DEBUG fprintf(stderr, "strip_html> Null input string\n"); #endif return (0); } while (1) { pairs = pcre_exec(re, extra, src, src_len, start_offset, flags, ovector, VECSIZE); if (pairs < 0) { if (pairs == PCRE_ERROR_NOMATCH) { #ifdef DEBUG fprintf(stderr, "strip_html> No match for string '%s'.\n", src + start_offset); #endif /* No HTML found in remainder of string, just copy input to output */ strcpy(dest_cur, src + start_offset); return (0); } else { if (debug) { fprintf(stderr, "strip_html: Problem running regex for string %s: ", src); print_re_error(pairs); } return (1); } } else if (pairs == 0) { /* This should not happen */ #ifdef DEBUG fprintf(stderr, "strip_html> More than %d matches for string %s\n", VECSIZE / 3, src); #endif pairs = VECSIZE / 3; } else { /* This should always be 1 for this regex */ #ifdef DEBUG fprintf(stderr, "strip_html> Matched %d pair(s)\n", pairs); #endif } #ifdef DEBUG fprintf(stderr, "strip_html> ovector[0]: %d\n", ovector[0]); fprintf(stderr, "strip_html> ovector[1]: %d\n", ovector[1]); for (i = 0; i < pairs; i++) { int len; fprintf(stderr, "strip_html> pair: %d\n", i); len = pcre_copy_substring(src, ovector, pairs, i, buf, sizeof(buf)); if (len < 0) { fprintf(stderr, "strip_html> Problem getting substring %d: %d\n", i, len); } else { fprintf(stderr, "strip_html> Matched string %d: %s\n", i, buf); } } #endif /* Copy part before the match */ memcpy(dest_cur, src + start_offset, ovector[0] - start_offset); /* Update current location in destination buffer */ dest_cur += ovector[0] - start_offset; #ifdef DEBUG fprintf(stderr, "dest: %s\n", dest); #endif /* Update offset to point after data */ start_offset = ovector[1]; } return (0); }
- References:
- [tlug] Re: tlug] Security question with grep/e...
- From: Jim Breen
Home | Main Index | Thread Index
- Prev by Date: Re: [tlug] Tech Meeting
- Next by Date: Re: [tlug] Re: Security question with grep/e...
- Previous by thread: Re: [tlug] Re: tlug] Security question with grep/e...
- Next by thread: Re: [tlug] Re: tlug] Security question with grep/e...
- Index(es):
Home Page Mailing List Linux and Japan TLUG Members Links