Mailing List Archive


[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [tlug] Re: [tlug] Security question with grep/e...



Jim,

Just for fun and reference purposes.... 

Here is a C program I wrote years ago that uses the 
PCRE (Perl Compatible Regular Expressions) package from
http://www.pcre.org/  to strip HTML tags out of a file. 

Obviously, this is much more painful than the equivalent in Python
or Perl. But it does work. And it's portable (once you
have installed the PCRE libs...). Maybe you could modify it to do
what you want. There is now a C++ wrapper which makes the
library easier to use, too. 

Regards,
Jake
#include <assert.h>
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <pcre.h>

/* Number of ints in the ovector array that strip_html() hands to
   pcre_exec(); strip_html() treats VECSIZE / 3 as the pair limit when
   pcre_exec() returns 0. */
#define VECSIZE 60 
/* Size in bytes of the line buffers in main() and of the DEBUG-only
   scratch buffer in strip_html(). */
#define BUFLEN 10000

/* Passed by main() to strip_html() as its debug argument; when nonzero,
   pcre_exec() failures are reported on stderr. Nothing in the code shown
   here changes it from 0. */
int debugMode = 0;

/*
 * Write a one-line description of a PCRE result code to stderr.
 *
 * pairs - the code to describe (strip_html() passes the negative value
 *         returned by pcre_exec()). Codes without a dedicated message are
 *         reported numerically as "Unknown error".
 */
void
print_re_error(int pairs)
{
    const char *message = NULL;

    if (pairs == PCRE_ERROR_NOMATCH) {
        message = "No match.\n";
    } else if (pairs == PCRE_ERROR_NULL) {
        message = "One or more NULL input params.\n";
    } else if (pairs == PCRE_ERROR_BADOPTION) {
        message = "Invalid option specified.\n";
    } else if (pairs == PCRE_ERROR_BADMAGIC) {
        message = "Bad magic.\n";
    } else if (pairs == PCRE_ERROR_UNKNOWN_NODE) {
        message = "Unknown node.\n";
    } else if (pairs == PCRE_ERROR_NOMEMORY) {
        message = "Out of memory.\n";
    }

    if (message == NULL) {
        /* No canned text for this code: show the raw number instead. */
        fprintf(stderr, "Unknown error %d.\n", pairs);
        return;
    }
    fputs(message, stderr);
}

/*
 * Compile a regular expression with pcre_compile().
 *
 * re_str  - pattern text.
 * options - option bits forwarded unchanged to pcre_compile().
 *
 * Returns the compiled pattern, or NULL after printing the error offset
 * and message supplied by pcre_compile() to stderr. When built with
 * RE_DEBUG, progress messages are printed to stdout.
 */
pcre*
re_compile(const char* re_str, int options)
{
    const char *message = NULL;
    int where = 0;
    pcre *compiled;

#ifdef RE_DEBUG
    printf("re_compile> re_str: %s\n", re_str);
#endif

    compiled = pcre_compile(re_str, options, &message, &where, NULL);
    if (compiled != NULL) {
#ifdef RE_DEBUG
        printf("re_compile> pcre_compile succeeded\n");
#endif
        return compiled;
    }

    fprintf(stderr, "pcre_compile: error at offset %d: %s\n", where, message);
    return NULL;
}

/* Forward declaration: strip_html() is defined after main(), which calls
   it once per input line. */
int 
strip_html(char* dest, const char* src, size_t src_len, 
           pcre *re, pcre_extra *extra, int debug);

/*
 * Filter stdin to stdout, removing HTML tags.
 *
 * Input is read with fgets() in pieces of at most BUFLEN - 1 characters;
 * each piece is passed through strip_html() and the result printed. A
 * piece that strip_html() rejects is reported on stderr and skipped.
 *
 * Exits with status 3 if the tag pattern fails to compile, otherwise 0.
 * A pcre_study() error is reported on stderr but is not fatal.
 */
int
main (int argc, char *argv[])
{
    const char* html_re_str =  "<(?:[^>'\"]*|(['\"]).*?\\1)*>";
    const char* study_error = NULL;
    char source[BUFLEN];
    char dest[BUFLEN];
    pcre_extra *extra;
    pcre *html_re;

    html_re = re_compile(html_re_str, 0);
    if (!html_re) {
        exit (3);
    }

    extra = pcre_study(html_re, 0, &study_error);
    if (study_error) {
        fprintf(stderr, "pcre_study: %s\n", study_error);
    }

    for (;;) {
        if (fgets(source, BUFLEN, stdin) == NULL) {
            break;
        }

        if (strip_html(dest, source, strlen(source),
                       html_re, extra, debugMode) != 0) {
            fprintf(stderr, "Problem parsing string %s\n", source);
            continue;
        }

        printf("%s", dest);
    }

    pcre_free(extra);
    pcre_free(html_re);

    exit (0);
}

/*
 * Strip HTML tags from src, copying the remaining text to dest.
 *
 * dest    - output buffer; always left NUL-terminated. No size is passed
 *           in, so the caller must supply at least src_len + 1 bytes (the
 *           output is never longer than the input).
 * src     - input text; only the first src_len bytes are examined or copied.
 * src_len - number of bytes of src to process.
 * re      - compiled pattern; every match of it is removed. Must not be NULL.
 * extra   - study data for re (may be NULL), passed through to pcre_exec().
 * debug   - when nonzero, pcre_exec() failures are described on stderr.
 *
 * Returns 0 on success (including NULL or empty input, which yields an
 * empty dest) and 1 if pcre_exec() fails with anything other than
 * PCRE_ERROR_NOMATCH. On failure dest holds the text produced so far.
 */
int
strip_html(char* dest, const char* src, size_t src_len, 
           pcre *re, pcre_extra *extra, int debug)
{
    int ovector[VECSIZE];  /* Match offsets filled in by pcre_exec() */
    int pairs = 0;         /* pcre_exec() result: pair count or error code */
    int start_offset = 0;  /* Point in src string to start searching */
    int flags = 0;         /* Regex matching flags */
    int subject_len = (int)src_len; /* pcre_exec() takes an int length */
    char* dest_cur = dest; /* Pointer to current location in dest buffer */

#ifdef DEBUG
    int i; /* Utility counter */
    char buf[BUFLEN]; /* buffer used for debugging */
#endif

    assert(re);
    assert(dest);

    /* Nothing to scan: hand back an empty string rather than leaving
       dest untouched for the caller to print. */
    if (!src || src_len == 0 || !src[0]) {
#ifdef DEBUG
        fprintf(stderr, "strip_html> Null input string\n"); 
#endif
        dest[0] = '\0';
        return (0);
    }

#ifdef DEBUG
    fprintf(stderr, "strip_html> parsing string %s\n", src); 
#endif

    while (1) {
        pairs = pcre_exec(re, extra, src, subject_len, start_offset, 
                          flags, ovector, VECSIZE);  
        if (pairs < 0) {
            if (pairs == PCRE_ERROR_NOMATCH) {
                size_t rest = (size_t)(subject_len - start_offset);
#ifdef DEBUG
                fprintf(stderr, 
                        "strip_html> No match for string '%s'.\n", 
                        src + start_offset);
#endif

                /* No HTML found in remainder of string: copy what is left
                   of the src_len bytes and terminate the output. */
                memcpy(dest_cur, src + start_offset, rest);
                dest_cur[rest] = '\0';
                return (0);
            }

            if (debug) {
                fprintf(stderr, 
                    "strip_html: Problem running regex for string %s: ", 
                    src);
                print_re_error(pairs);
            }
            *dest_cur = '\0';
            return (1);
        }
        else if (pairs == 0) {
            /* pcre_exec() returns 0 when ovector is too small for all
               captured pairs; this should not happen for this regex */
#ifdef DEBUG
                fprintf(stderr, 
                    "strip_html> More than %d matches for string %s\n", 
                    VECSIZE / 3, src);
#endif
            pairs = VECSIZE / 3;
        }
        else {
            /* This should always be 1 for this regex */
#ifdef DEBUG
            fprintf(stderr, "strip_html> Matched %d pair(s)\n", pairs); 
#endif
        }

#ifdef DEBUG
        fprintf(stderr, "strip_html> ovector[0]: %d\n", ovector[0]);
        fprintf(stderr, "strip_html> ovector[1]: %d\n", ovector[1]);

        for (i = 0; i < pairs; i++) {
            int len;
            fprintf(stderr, "strip_html> pair: %d\n", i);
            len = pcre_copy_substring(src, ovector, pairs, i, 
                                      buf, sizeof(buf));
            if (len < 0) {
                fprintf(stderr, 
                        "strip_html> Problem getting substring %d: %d\n", i, len);
            }
            else {
                fprintf(stderr, 
                        "strip_html> Matched string %d: %s\n", i, buf);
            }
        }
#endif

        /* Copy part before the match */
        memcpy(dest_cur, src + start_offset,
               (size_t)(ovector[0] - start_offset));

        /* Update current location in destination buffer and keep dest a
           valid string at every step */
        dest_cur += ovector[0] - start_offset;
        *dest_cur = '\0';

#ifdef DEBUG
        fprintf(stderr, "dest: %s\n", dest);
#endif

        if (ovector[1] > ovector[0]) {
            /* Update offset to point after the removed match */
            start_offset = ovector[1];
        }
        else {
            /* Zero-length match: restarting at the same offset would loop
               forever, so pass one byte through and move on.
               NOTE(review): a one-byte step assumes re was not compiled in
               UTF-8 mode - confirm if other patterns are ever passed in. */
            if (ovector[1] >= subject_len) {
                return (0);
            }
            *dest_cur++ = src[ovector[1]];
            *dest_cur = '\0';
            start_offset = ovector[1] + 1;
        }
    }
}


Home | Main Index | Thread Index

Home Page Mailing List Linux and Japan TLUG Members Links