HelenOS sources

root/uspace/lib/posix/src/fnmatch.c

/* [<][>][^][v][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. _coll_elm_get
  2. _coll_elm_char
  3. _coll_elm_match
  4. _coll_elm_between
  5. _get_delimited
  6. _class_compare
  7. _is_in_class
  8. _match_char_class
  9. _next_coll_elm
  10. _match_bracket_expr
  11. _partial_match
  12. _full_match
  13. _casefold
  14. fnmatch
  15. __posix_fnmatch_test

/*
 * Copyright (c) 2011 Jiri Zarevucky
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 * - The name of the author may not be used to endorse or promote products
 *   derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/** @addtogroup libposix
 * @{
 */
/** @file Filename-matching.
 */

/*
 * This file contains an implementation of the fnmatch() pattern matching
 * function. There is more code than necessary to account for the possibility
 * of adding POSIX-like locale support to the system in the future. Functions
 * that are only necessary for locale support currently simply use single
 * characters for "collation elements".
 * When (or if) locales are properly implemented, extending this implementation
 * will be fairly straightforward.
 */

#include <stdbool.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>

#include "internal/common.h"
#include <fnmatch.h>

/* Returned by _match... functions. */
#define INVALID_PATTERN -1

/*
 * Type for collating element, simple identity with characters now,
 * but may be extended for better locale support.
 */
typedef int coll_elm_t;

/** Return value indicating that the element in question
 * is not valid in the current locale. (That is, if locales are supported.)
 */
#define COLL_ELM_INVALID -1

/**
 * Get collating element matching a string.
 *
 * @param str String representation of the element.
 * @return Matching collating element or COLL_ELM_INVALID.
 */
static coll_elm_t _coll_elm_get(const char *str)
{
        if (str[0] == '\0' || str[1] != '\0') {
                return COLL_ELM_INVALID;
        }
        return str[0];
}

/**
 * Get collating element matching a single character.
 *
 * @param c Character representation of the element.
 * @return Matching collating element.
 */
static coll_elm_t _coll_elm_char(int c)
{
        return c;
}

/**
 * Match collating element with a beginning of a string.
 *
 * @param elm Collating element to match.
 * @param str String which beginning should match the element.
 * @return 0 if the element doesn't match, or the number of characters matched.
 */
static int _coll_elm_match(coll_elm_t elm, const char *str)
{
        return elm == *str;
}

/**
 * Checks whether a string begins with a collating element in the given range.
 * Ordering depends on the locale (if locales are supported).
 *
 * @param first First element of the range.
 * @param second Last element of the range.
 * @param str String to match.
 * @return 0 if there is no match, or the number of characters matched.
 */
static int _coll_elm_between(coll_elm_t first, coll_elm_t second,
    const char *str)
{
        return *str >= first && *str <= second;
}

/**
 * Read a string delimited by [? and ?].
 *
 * @param pattern Pointer to the string to read from. Its position is moved
 *    to the first character after the closing ].
 * @param seq The character on the inside of brackets.
 * @param buf Read buffer.
 * @param buf_sz Read buffer's size. If the buffer is not large enough for
 *    the entire string, the string is cut with no error indication.
 * @param flags Flags modifying the behavior.
 * @return True on success, false if the pattern is invalid.
 */
static bool _get_delimited(
    const char **pattern, int seq,
    char *buf, size_t buf_sz, int flags)
{
        const bool noescape = (flags & FNM_NOESCAPE) != 0;
        const bool pathname = (flags & FNM_PATHNAME) != 0;

        const char *p = *pattern;
        assert(p[0] == '[' && p[1] == seq /* Caller should ensure this. */);
        p += 2;

        while (true) {
                if (*p == seq && *(p + 1) == ']') {
                        /* String properly ended, return. */
                        *pattern = p + 2;
                        *buf = '\0';
                        return true;
                }
                if (!noescape && *p == '\\') {
                        p++;
                }
                if (*p == '\0') {
                        /* String not ended properly, invalid pattern. */
                        return false;
                }
                if (pathname && *p == '/') {
                        /* Slash in a pathname pattern is invalid. */
                        return false;
                }
                if (buf_sz > 1) {
                        /* Only add to the buffer if there is space. */
                        *buf = *p;
                        buf++;
                        buf_sz--;
                }
                p++;
        }
}

/*
 * CHARACTER CLASSES
 */

#define MAX_CLASS_OR_COLL_LEN 6

struct _char_class {
        const char *name;
        int (*func) (int);
};

/* List of supported character classes. */
static const struct _char_class _char_classes[] = {
        { "alnum", isalnum },
        { "alpha", isalpha },
        { "blank", isblank },
        { "cntrl", iscntrl },
        { "digit", isdigit },
        { "graph", isgraph },
        { "lower", islower },
        { "print", isprint },
        { "punct", ispunct },
        { "space", isspace },
        { "upper", isupper },
        { "xdigit", isxdigit }
};

/**
 * Compare function for binary search in the _char_classes array.
 *
 * @param key Key of the searched element.
 * @param elem Element of _char_classes array.
 * @return Ordering indicator (-1 less than, 0 equal, 1 greater than).
 */
static int _class_compare(const void *key, const void *elem)
{
        const struct _char_class *class = elem;
        return strcmp((const char *) key, class->name);
}

/**
 * Returns whether the given character belongs to the specified character class.
 *
 * @param cname Name of the character class.
 * @param c Character.
 * @return True if the character belongs to the class, false otherwise.
 */
static bool _is_in_class (const char *cname, int c)
{
        /* Search for class in the array of supported character classes. */
        const struct _char_class *class = bsearch(cname, _char_classes,
            sizeof(_char_classes) / sizeof(struct _char_class),
            sizeof(struct _char_class), _class_compare);

        if (class == NULL) {
                /* No such class supported - treat as an empty class. */
                return false;
        } else {
                /* Class matched. */
                return class->func(c);
        }
}

/**
 * Tries to parse an initial part of the pattern as a character class pattern,
 * and if successful, matches the beginning of the given string against the class.
 *
 * @param pattern Pointer to the pattern to match. Must begin with a class
 *    specifier and is repositioned to the first character after the specifier
 *    if successful.
 * @param str String to match.
 * @param flags Flags modifying the behavior (see fnmatch()).
 * @return INVALID_PATTERN if the pattern doesn't start with a valid class
 *    specifier, 0 if the beginning of the matched string doesn't belong
 *    to the class, or positive number of characters matched.
 */
static int _match_char_class(const char **pattern, const char *str, int flags)
{
        char class[MAX_CLASS_OR_COLL_LEN + 1];

        if (!_get_delimited(pattern, ':', class, sizeof(class), flags)) {
                return INVALID_PATTERN;
        }

        return _is_in_class(class, *str);
}

/*
 * END CHARACTER CLASSES
 */

/**
 * Reads the next collating element in the pattern, taking into account
 * locale (if supported) and flags (see fnmatch()).
 *
 * @param pattern Pattern.
 * @param flags Flags given to fnmatch().
 * @return Collating element on success,
 *     or COLL_ELM_INVALID if the pattern is invalid.
 */
static coll_elm_t _next_coll_elm(const char **pattern, int flags)
{
        assert(pattern != NULL);
        assert(*pattern != NULL);
        assert(**pattern != '\0');

        const char *p = *pattern;
        const bool noescape = (flags & FNM_NOESCAPE) != 0;
        const bool pathname = (flags & FNM_PATHNAME) != 0;

        if (*p == '[') {
                if (*(p + 1) == '.') {
                        char buf[MAX_CLASS_OR_COLL_LEN + 1];
                        if (!_get_delimited(pattern, '.', buf, sizeof(buf), flags)) {
                                return COLL_ELM_INVALID;
                        }
                        return _coll_elm_get(buf);
                }

                if (*(p + 1) == '=') {
                        char buf[MAX_CLASS_OR_COLL_LEN + 1];
                        if (!_get_delimited(pattern, '=', buf, sizeof(buf), flags)) {
                                return COLL_ELM_INVALID;
                        }
                        return _coll_elm_get(buf);
                }
        }

        if (!noescape && *p == '\\') {
                p++;
                if (*p == '\0') {
                        *pattern = p;
                        return COLL_ELM_INVALID;
                }
        }
        if (pathname && *p == '/') {
                return COLL_ELM_INVALID;
        }

        *pattern = p + 1;
        return _coll_elm_char(*p);
}

#define _matched(match) { \
                int _match = match; \
                if (_match < 0) { \
                        /* Invalid pattern */ \
                        return _match; \
                } else if (matched == 0 && _match > 0) { \
                        /* First match */ \
                        matched = _match; \
                } \
        }

/**
 * Matches the beginning of the given string against a bracket expression
 * the pattern begins with.
 *
 * @param pattern Pointer to the beginning of a bracket expression in a pattern.
 *     On success, the pointer is moved to the first character after the
 *     bracket expression.
 * @param str Unmatched part of the string.
 * @param flags Flags given to fnmatch().
 * @return INVALID_PATTERN if the pattern is invalid, 0 if there is no match
 *     or the number of matched characters on success.
 */
static int _match_bracket_expr(const char **pattern, const char *str, int flags)
{
        const bool pathname = (flags & FNM_PATHNAME) != 0;
        const bool special_period = (flags & FNM_PERIOD) != 0;
        const char *p = *pattern;
        bool negative = false;
        int matched = 0;

        assert(*p == '[');  /* calling code should ensure this */
        p++;

        if (*str == '\0' || (pathname && *str == '/') ||
            (pathname && special_period && *str == '.' && *(str - 1) == '/')) {
                /*
                 * No bracket expression matches end of string,
                 * slash in pathname match or initial period with FNM_PERIOD
                 * option.
                 */
                return 0;
        }

        if (*p == '^' || *p == '!') {
                negative = true;
                p++;
        }

        if (*p == ']') {
                /* When ']' is first, treat it as a normal character. */
                _matched(*str == ']');
                p++;
        }

        coll_elm_t current_elm = COLL_ELM_INVALID;

        while (*p != ']') {
                if (*p == '-' && *(p + 1) != ']' &&
                    current_elm != COLL_ELM_INVALID) {
                        /* Range expression. */
                        p++;
                        coll_elm_t end_elm = _next_coll_elm(&p, flags);
                        if (end_elm == COLL_ELM_INVALID) {
                                return INVALID_PATTERN;
                        }
                        _matched(_coll_elm_between(current_elm, end_elm, str));
                        continue;
                }

                if (*p == '[' && *(p + 1) == ':') {
                        current_elm = COLL_ELM_INVALID;
                        _matched(_match_char_class(&p, str, flags));
                        continue;
                }

                current_elm = _next_coll_elm(&p, flags);
                if (current_elm == COLL_ELM_INVALID) {
                        return INVALID_PATTERN;
                }
                _matched(_coll_elm_match(current_elm, str));
        }

        /* No error occured - update pattern pointer. */
        *pattern = p + 1;

        if (matched == 0) {
                /* No match found */
                return negative;
        } else {
                /* Matched 'match' characters. */
                return negative ? 0 : matched;
        }
}

/**
 * Matches a portion of the pattern containing no asterisks (*) against
 * the given string.
 *
 * @param pattern Pointer to the unmatched portion of the pattern.
 *     On success, the pointer is moved to the first asterisk, or to the
 *     terminating nul character, whichever occurs first.
 * @param string Pointer to the input string. On success, the pointer is moved
 *     to the first character that wasn't explicitly matched.
 * @param flags Flags given to fnmatch().
 * @return True if the entire subpattern matched. False otherwise.
 */
static bool _partial_match(const char **pattern, const char **string, int flags)
{
        /*
         * Only a single *-delimited subpattern is matched here.
         * So in this function, '*' is understood as the end of pattern.
         */

        const bool pathname = (flags & FNM_PATHNAME) != 0;
        const bool special_period = (flags & FNM_PERIOD) != 0;
        const bool noescape = (flags & FNM_NOESCAPE) != 0;
        const bool leading_dir = (flags & FNM_LEADING_DIR) != 0;

        const char *s = *string;
        const char *p = *pattern;

        while (*p != '*') {
                /* Bracket expression. */
                if (*p == '[') {
                        int matched = _match_bracket_expr(&p, s, flags);
                        if (matched == 0) {
                                /* Doesn't match. */
                                return false;
                        }
                        if (matched != INVALID_PATTERN) {
                                s += matched;
                                continue;
                        }

                        assert(matched == INVALID_PATTERN);
                        /* Fall through to match [ as an ordinary character. */
                }

                /* Wildcard match. */
                if (*p == '?') {
                        if (*s == '\0') {
                                /* No character to match. */
                                return false;
                        }
                        if (pathname && *s == '/') {
                                /* Slash must be matched explicitly. */
                                return false;
                        }
                        if (special_period && pathname &&
                            *s == '.' && *(s - 1) == '/') {
                                /* Initial period must be matched explicitly. */
                                return false;
                        }

                        /* None of the above, match anything else. */
                        p++;
                        s++;
                        continue;
                }

                if (!noescape && *p == '\\') {
                        /* Escaped character. */
                        p++;
                }

                if (*p == '\0') {
                        /*
                         * End of pattern, must match end of string or
                         * an end of subdirectory name (optional).
                         */

                        if (*s == '\0' || (leading_dir && *s == '/')) {
                                break;
                        }

                        return false;
                }

                if (*p == *s) {
                        /* Exact match. */
                        p++;
                        s++;
                        continue;
                }

                /* Nothing matched. */
                return false;
        }

        /* Entire sub-pattern matched. */

        /* postconditions */
        assert(*p == '\0' || *p == '*');
        assert(*p != '\0' || *s == '\0' || (leading_dir && *s == '/'));

        *pattern = p;
        *string = s;
        return true;
}

/**
 * Match string against a pattern.
 *
 * @param pattern Pattern.
 * @param string String to match.
 * @param flags Flags given to fnmatch().
 * @return True if the string matched the pattern, false otherwise.
 */
static bool _full_match(const char *pattern, const char *string, int flags)
{
        const bool pathname = (flags & FNM_PATHNAME) != 0;
        const bool special_period = (flags & FNM_PERIOD) != 0;
        const bool leading_dir = (flags & FNM_LEADING_DIR) != 0;

        if (special_period && *string == '.') {
                /* Initial dot must be matched by an explicit dot in pattern. */
                if (*pattern != '.') {
                        return false;
                }
                pattern++;
                string++;
        }

        if (*pattern != '*') {
                if (!_partial_match(&pattern, &string, flags)) {
                        /* The initial match must succeed. */
                        return false;
                }
        }

        while (*pattern != '\0') {
                assert(*pattern == '*');
                pattern++;

                bool matched = false;

                const char *end;
                if (pathname && special_period &&
                    *string == '.' && *(string - 1) == '/') {
                        end = string;
                } else {
                        end = gnu_strchrnul(string, pathname ? '/' : '\0');
                }

                /* Try to match every possible offset. */
                while (string <= end) {
                        if (_partial_match(&pattern, &string, flags)) {
                                matched = true;
                                break;
                        }
                        string++;
                }

                if (matched) {
                        continue;
                }

                return false;
        }

        return *string == '\0' || (leading_dir && *string == '/');
}

/**
 * Transform the entire string to lowercase.
 *
 * @param s Input string.
 * @return Newly allocated copy of the input string with all uppercase
 *     characters folded to their lowercase variants.
 */
static char *_casefold(const char *s)
{
        assert(s != NULL);
        char *result = strdup(s);
        for (char *i = result; *i != '\0'; ++i) {
                *i = tolower(*i);
        }
        return result;
}

/**
 * Filename pattern matching.
 *
 * @param pattern Pattern to match the string against.
 * @param string Matched string.
 * @param flags Flags altering the matching of special characters
 *     (mainly for dot and slash).
 * @return Zero if the string matches the pattern, FNM_NOMATCH otherwise.
 */
int fnmatch(const char *pattern, const char *string, int flags)
{
        assert(pattern != NULL);
        assert(string != NULL);

        // TODO: don't fold everything in advance, but only when needed

        if ((flags & FNM_CASEFOLD) != 0) {
                /* Just fold the entire pattern and string. */
                pattern = _casefold(pattern);
                string = _casefold(string);
        }

        bool result = _full_match(pattern, string, flags);

        if ((flags & FNM_CASEFOLD) != 0) {
                if (pattern) {
                        free((char *) pattern);
                }
                if (string) {
                        free((char *) string);
                }
        }

        return result ? 0 : FNM_NOMATCH;
}

// FIXME: put the testcases to the app/tester after fnmatch is included into libc

#if 0

#include <stdio.h>

#define fnmatch_test(x) { if (x) printf("SUCCESS: "#x"\n"); else { printf("FAILED: "#x"\n"); fail++; } }
#define match(s1, s2, flags) fnmatch_test(fnmatch(s1, s2, flags) == 0)
#define nomatch(s1, s2, flags) fnmatch_test(fnmatch(s1, s2, flags) == FNM_NOMATCH)

void __posix_fnmatch_test()
{
        int fail = 0;

        static_assert(FNM_PATHNAME == FNM_FILE_NAME);
        match("", "", 0);
        match("*", "hello", 0);
        match("hello", "hello", 0);
        match("hello*", "hello", 0);
        nomatch("hello?", "hello", 0);
        match("*hello", "prdel hello", 0);
        match("he[sl]lo", "hello", 0);
        match("he[sl]lo", "heslo", 0);
        nomatch("he[sl]lo", "heblo", 0);
        nomatch("he[^sl]lo", "hello", 0);
        nomatch("he[^sl]lo", "heslo", 0);
        match("he[^sl]lo", "heblo", 0);
        nomatch("he[!sl]lo", "hello", 0);
        nomatch("he[!sl]lo", "heslo", 0);
        match("he[!sl]lo", "heblo", 0);
        match("al*[c-t]a*vis*ta", "alheimer talir jehovista", 0);
        match("al*[c-t]a*vis*ta", "alfons had jehovista", 0);
        match("[a-ce-z]", "a", 0);
        match("[a-ce-z]", "c", 0);
        nomatch("[a-ce-z]", "d", 0);
        match("[a-ce-z]", "e", 0);
        match("[a-ce-z]", "z", 0);
        nomatch("[^a-ce-z]", "a", 0);
        nomatch("[^a-ce-z]", "c", 0);
        match("[^a-ce-z]", "d", 0);
        nomatch("[^a-ce-z]", "e", 0);
        nomatch("[^a-ce-z]", "z", 0);
        match("helen??", "helenos", 0);
        match("****booo****", "booo", 0);

        match("hello[[:space:]]world", "hello world", 0);
        nomatch("hello[[:alpha:]]world", "hello world", 0);

        match("/hoooo*", "/hooooooo/hooo", 0);
        nomatch("/hoooo*", "/hooooooo/hooo", FNM_PATHNAME);
        nomatch("/hoooo*/", "/hooooooo/hooo", FNM_PATHNAME);
        match("/hoooo*/*", "/hooooooo/hooo", FNM_PATHNAME);
        match("/hoooo*/hooo", "/hooooooo/hooo", FNM_PATHNAME);
        match("/hoooo*", "/hooooooo/hooo", FNM_PATHNAME | FNM_LEADING_DIR);
        nomatch("/hoooo*/", "/hooooooo/hooo", FNM_PATHNAME | FNM_LEADING_DIR);
        nomatch("/hoooo", "/hooooooo/hooo", FNM_LEADING_DIR);
        match("/hooooooo", "/hooooooo/hooo", FNM_LEADING_DIR);

        match("*", "hell", 0);
        match("*?", "hell", 0);
        match("?*?", "hell", 0);
        match("?*??", "hell", 0);
        match("??*??", "hell", 0);
        nomatch("???*??", "hell", 0);

        nomatch("", "hell", 0);
        nomatch("?", "hell", 0);
        nomatch("??", "hell", 0);
        nomatch("???", "hell", 0);
        match("????", "hell", 0);

        match("*", "h.ello", FNM_PERIOD);
        match("*", "h.ello", FNM_PATHNAME | FNM_PERIOD);
        nomatch("*", ".hello", FNM_PERIOD);
        match("h?ello", "h.ello", FNM_PERIOD);
        nomatch("?hello", ".hello", FNM_PERIOD);
        match("/home/user/.*", "/home/user/.hello", FNM_PATHNAME | FNM_PERIOD);
        match("/home/user/*", "/home/user/.hello", FNM_PERIOD);
        nomatch("/home/user/*", "/home/user/.hello", FNM_PATHNAME | FNM_PERIOD);

        nomatch("HeLlO", "hello", 0);
        match("HeLlO", "hello", FNM_CASEFOLD);

        printf("Failed: %d\n", fail);
}

#endif

/** @}
 */

/* [<][>][^][v][top][bottom][index][help] */
HelenOS homepage, sources at GitHub