implement filtertype keyword and fnmatch-based filtering

as suggested in #212, it seems the majority of people don't understand
that input was expected to be in regex format and people were using
filter lists containing plain hostnames, e.g. `www.google.com`.

apart from that, using fnmatch() for matching is actually a lot less
computationally expensive and allows the use of big blacklists without
incurring a huge performance hit.

the config file now understands a new option `FilterType` which can
be one of `bre`, `ere` and `fnmatch`.
The `FilterExtended` option was deprecated in favor of it.
It still works, but will be removed in the release after the next.
This commit is contained in:
rofl0r 2021-05-10 00:28:54 +01:00
parent 26db3f6cc9
commit 235b1c10a7
10 changed files with 117 additions and 59 deletions

View File

@ -267,7 +267,7 @@ domains. This option specifies the location of the file
containing the filter rules, one rule per line.
Rules are specified as POSIX basic regular expressions (BRE), unless
FilterExtended is activated.
another FilterType is specified.
Comment lines start with a `#` character.
Example filter file contents:
@ -287,6 +287,20 @@ Example filter file contents:
# filter any domain that starts with adserver
^adserver
=item B<FilterType>
This option can be set to one of `bre`, `ere`, or `fnmatch`.
If `bre` is set, the rules specified in the filter file are matched
using POSIX basic regular expressions; if `ere` is set, using
POSIX extended regular expressions; and if `fnmatch` is set, using
the `fnmatch` function as specified in the manpage `man 3p fnmatch`.
`fnmatch` matching is identical to what's used in the shell to match
filenames, so for example `*.google.com` matches everything that
ends with `.google.com`.
If you don't know what regular expressions are or you're using filter
lists from 3rd party sources, `fnmatch` is probably what you want.
It's also the fastest matching method of the three.
=item B<FilterURLs>
If this boolean option is set to `Yes` or `On`, filtering is
@ -300,6 +314,7 @@ recommended not to use this option.
=item B<FilterExtended>
Deprecated. Use `FilterType ere` instead.
If this boolean option is set to `Yes`, then extended POSIX
regular expressions are used for matching the filter rules.
The default is to use basic POSIX regular expressions.
@ -308,7 +323,11 @@ The default is to use basic POSIX regular expressions.
If this boolean option is set to `Yes`, then the filter rules
are matched in a case sensitive manner. The default is to
match case-insensitively.
match case-insensitively, unfortunately.
If you set this to `Yes`, then your matching will be almost
twice as fast.
This setting affects only the `bre` and `ere` FilterTypes; `fnmatch`
is always case sensitive.
=item B<FilterDefaultDeny>

View File

@ -240,10 +240,9 @@ ViaProxyName "tinyproxy"
#FilterURLs On
#
# FilterExtended: Use POSIX Extended regular expressions rather than
# basic.
# FilterType: Use bre (default), ere, or fnmatch for filtering.
#
#FilterExtended On
#FilterType fnmatch
#
# FilterCaseSensitive: Use case sensitive regular expressions.

View File

@ -34,6 +34,7 @@ config_directive_find (register const char *str, register size_t len)
{"defaulterrorfile", CD_defaulterrorfile},
{"startservers", CD_startservers},
{"filtercasesensitive", CD_filtercasesensitive},
{"filtertype", CD_filtertype},
{"filterurls", CD_filterurls},
{"filter", CD_filter},
{"reversemagic", CD_reversemagic},

View File

@ -51,6 +51,7 @@ filterurls, CD_filterurls
filterextended, CD_filterextended
filterdefaultdeny, CD_filterdefaultdeny
filtercasesensitive, CD_filtercasesensitive
filtertype, CD_filtertype
reversebaseurl, CD_reversebaseurl
reverseonly, CD_reverseonly
reversemagic, CD_reversemagic

View File

@ -33,6 +33,7 @@ CD_errorfile,
CD_addheader,
CD_filter,
CD_filterurls,
CD_filtertype,
CD_filterextended,
CD_filterdefaultdeny,
CD_filtercasesensitive,

View File

@ -135,6 +135,7 @@ static HANDLE_FUNC (handle_filtercasesensitive);
static HANDLE_FUNC (handle_filterdefaultdeny);
static HANDLE_FUNC (handle_filterextended);
static HANDLE_FUNC (handle_filterurls);
static HANDLE_FUNC (handle_filtertype);
#endif
static HANDLE_FUNC (handle_group);
static HANDLE_FUNC (handle_listen);
@ -234,6 +235,7 @@ struct {
STDCONF (filterextended, BOOL, handle_filterextended),
STDCONF (filterdefaultdeny, BOOL, handle_filterdefaultdeny),
STDCONF (filtercasesensitive, BOOL, handle_filtercasesensitive),
STDCONF (filtertype, "(bre|ere|fnmatch)", handle_filtertype),
#endif
#ifdef REVERSE_SUPPORT
/* Reverse proxy arguments */
@ -952,6 +954,11 @@ static HANDLE_FUNC (handle_basicauth)
}
#ifdef FILTER_ENABLE
/*
 * Emit a configuration warning saying that the option described by
 * `arg` is deprecated.
 *
 * arg:    text substituted into the "deprecated option %s" message.
 * lineno: not referenced by name in this body.
 *         NOTE(review): presumably the CP_WARN macro picks up `lineno`
 *         from the enclosing scope to report the config file line;
 *         confirm against the macro before renaming or dropping it.
 */
static void warn_deprecated(const char *arg, unsigned long lineno) {
CP_WARN ("deprecated option %s", arg);
}
static HANDLE_FUNC (handle_filter)
{
return set_string_arg (&conf->filter, line, &match[2]);
@ -959,26 +966,53 @@ static HANDLE_FUNC (handle_filter)
/*
 * Handler for the `FilterURLs` boolean directive.
 *
 * NOTE(review): as listed, the first statement returns the result of
 * set_bool_arg(), which makes the filter_opts update below it
 * unreachable. This looks like a diff rendering in which that first
 * line is the removed pre-change statement and the remaining lines are
 * its replacement; confirm against the actual file.
 */
static HANDLE_FUNC (handle_filterurls)
{
return set_bool_arg (&conf->filter_url, line, &match[2]);
/* ORs FILTER_OPT_URL into filter_opts, scaled by the parsed boolean,
 * so a false value leaves filter_opts unchanged (it never clears). */
conf->filter_opts |=
get_bool_arg (line, &match[2]) * FILTER_OPT_URL;
return 0;
}
/*
 * Handler for the deprecated `FilterExtended` boolean directive.
 *
 * NOTE(review): as listed, the first statement returns the result of
 * set_bool_arg(), which makes everything below it unreachable. This
 * looks like a diff rendering in which that first line is the removed
 * pre-change statement and the remaining lines are its replacement;
 * confirm against the actual file.
 */
static HANDLE_FUNC (handle_filterextended)
{
return set_bool_arg (&conf->filter_extended, line, &match[2]);
/* point the user at the replacement directive */
warn_deprecated("FilterExtended, use FilterType", lineno);
/* ORs FILTER_OPT_TYPE_ERE into filter_opts, scaled by the parsed
 * boolean, so a false value leaves filter_opts unchanged. */
conf->filter_opts |=
get_bool_arg (line, &match[2]) * FILTER_OPT_TYPE_ERE;
return 0;
}
/*
 * Handler for the `FilterDefaultDeny` boolean directive.
 *
 * NOTE(review): as listed, the body both calls
 * filter_set_default_policy() and sets FILTER_OPT_DEFAULT_DENY in
 * filter_opts. This looks like a diff rendering in which the
 * assert/if/filter_set_default_policy lines are the removed pre-change
 * statements and the filter_opts update is their replacement; confirm
 * against the actual file.
 */
static HANDLE_FUNC (handle_filterdefaultdeny)
{
/* the directive's argument capture group must have matched */
assert (match[2].rm_so != -1);
if (get_bool_arg (line, &match[2]))
filter_set_default_policy (FILTER_DEFAULT_DENY);
/* ORs FILTER_OPT_DEFAULT_DENY into filter_opts, scaled by the parsed
 * boolean, so a false value leaves filter_opts unchanged. */
conf->filter_opts |=
get_bool_arg (line, &match[2]) * FILTER_OPT_DEFAULT_DENY;
return 0;
}
/*
 * Handler for the `FilterCaseSensitive` boolean directive.
 *
 * NOTE(review): as listed, the first statement returns the result of
 * set_bool_arg(), which makes the filter_opts update below it
 * unreachable. This looks like a diff rendering in which that first
 * line is the removed pre-change statement and the remaining lines are
 * its replacement; confirm against the actual file.
 */
static HANDLE_FUNC (handle_filtercasesensitive)
{
return set_bool_arg (&conf->filter_casesensitive, line, &match[2]);
/* ORs FILTER_OPT_CASESENSITIVE into filter_opts, scaled by the parsed
 * boolean, so a false value leaves filter_opts unchanged. */
conf->filter_opts |=
get_bool_arg (line, &match[2]) * FILTER_OPT_CASESENSITIVE;
return 0;
}
static HANDLE_FUNC (handle_filtertype)
{
static const struct { unsigned short flag; char type[8]; }
ftmap[] = {
{FILTER_OPT_TYPE_ERE, "ere"},
{FILTER_OPT_TYPE_BRE, "bre"},
{FILTER_OPT_TYPE_FNMATCH, "fnmatch"},
};
char *type;
unsigned i;
type = get_string_arg(line, &match[2]);
if (!type) return -1;
for(i=0;i<sizeof(ftmap)/sizeof(ftmap[0]);++i)
if(!strcmp(ftmap[i].type, type))
conf->filter_opts |= ftmap[i].flag;
safefree (type);
return 0;
}
#endif

View File

@ -50,9 +50,7 @@ struct config_s {
sblist *listen_addrs;
#ifdef FILTER_ENABLE
char *filter;
unsigned int filter_url; /* boolean */
unsigned int filter_extended; /* boolean */
unsigned int filter_casesensitive; /* boolean */
unsigned int filter_opts; /* enum filter_options */
#endif /* FILTER_ENABLE */
#ifdef XTINYPROXY_ENABLE
unsigned int add_xtinyproxy; /* boolean */

View File

@ -25,6 +25,7 @@
#include "main.h"
#include <regex.h>
#include <fnmatch.h>
#include "filter.h"
#include "heap.h"
#include "log.h"
@ -37,15 +38,17 @@
static int err;
/*
 * One entry of the filter list `fl`; each entry holds a single rule
 * read from the filter file.
 */
struct filter_list {
/* NOTE(review): this bare member duplicates u.cpatb below; it looks
 * like the removed pre-change line of a diff rendering - confirm. */
regex_t cpatb;
/* Which member is valid depends on the configured filter type:
 * with FILTER_OPT_TYPE_FNMATCH the rule text is kept in `pattern`,
 * otherwise the rule is compiled into `cpatb` by regcomp(). */
union {
regex_t cpatb;
char *pattern;
} u;
};
static sblist *fl = NULL;
static int already_init = 0;
static filter_policy_t default_policy = FILTER_DEFAULT_ALLOW;
/*
* Initializes a linked list of strings containing hosts/urls to be filtered
* Initializes a list of strings containing hosts/urls to be filtered
*/
void filter_init (void)
{
@ -66,10 +69,8 @@ void filter_init (void)
}
cflags = REG_NEWLINE | REG_NOSUB;
if (config->filter_extended)
cflags |= REG_EXTENDED;
if (!config->filter_casesensitive)
cflags |= REG_ICASE;
cflags |= (REG_EXTENDED * !!(config->filter_opts & FILTER_OPT_TYPE_ERE));
cflags |= (REG_ICASE * !(config->filter_opts & FILTER_OPT_CASESENSITIVE));
while (fgets (buf, FILTER_BUFFER_LEN, fd)) {
++lineno;
@ -107,13 +108,19 @@ void filter_init (void)
if (!fl) fl = sblist_new(sizeof(struct filter_list),
4096/sizeof(struct filter_list));
err = regcomp (&fe.cpatb, s, cflags);
if (err != 0) {
if (err == REG_ESPACE) goto oom;
fprintf (stderr,
"Bad regex in %s: line %d - %s\n",
config->filter, lineno, s);
exit (EX_DATAERR);
if (config->filter_opts & FILTER_OPT_TYPE_FNMATCH) {
fe.u.pattern = safestrdup(s);
if (!fe.u.pattern) goto oom;
} else {
err = regcomp (&fe.u.cpatb, s, cflags);
if (err != 0) {
if (err == REG_ESPACE) goto oom;
fprintf (stderr,
"Bad regex in %s: line %d - %s\n",
config->filter, lineno, s);
exit (EX_DATAERR);
}
}
if (!sblist_add(fl, &fe)) {
oom:;
@ -142,7 +149,10 @@ void filter_destroy (void)
if (fl) {
for (i = 0; i < sblist_getsize(fl); ++i) {
p = sblist_get(fl, i);
regfree (&p->cpatb);
if (config->filter_opts & FILTER_OPT_TYPE_FNMATCH)
safefree(p->u.pattern);
else
regfree (&p->u.cpatb);
}
sblist_free(fl);
}
@ -175,11 +185,14 @@ int filter_run (const char *str)
for (i = 0; i < sblist_getsize(fl); ++i) {
p = sblist_get(fl, i);
result =
regexec (&p->cpatb, str, (size_t) 0, (regmatch_t *) 0, 0);
if (config->filter_opts & FILTER_OPT_TYPE_FNMATCH)
result = fnmatch (p->u.pattern, str, 0);
else
result =
regexec (&p->u.cpatb, str, (size_t) 0, (regmatch_t *) 0, 0);
if (result == 0) {
if (default_policy == FILTER_DEFAULT_ALLOW)
if (!(config->filter_opts & FILTER_OPT_DEFAULT_DENY))
return 1;
else
return 0;
@ -187,16 +200,8 @@ int filter_run (const char *str)
}
COMMON_EXIT:
if (default_policy == FILTER_DEFAULT_ALLOW)
if (!(config->filter_opts & FILTER_OPT_DEFAULT_DENY))
return 0;
else
return 1;
}
/*
 * Set the default filtering policy.
 *
 * policy: value stored into the file-scope `default_policy` variable.
 *
 * NOTE(review): this function looks like it is being removed by the
 * change shown here (the DEFAULT_DENY state moves into
 * config->filter_opts) - confirm before relying on it.
 */
void filter_set_default_policy (filter_policy_t policy)
{
default_policy = policy;
}

View File

@ -21,16 +21,22 @@
#ifndef _TINYPROXY_FILTER_H_
#define _TINYPROXY_FILTER_H_
/*
 * Default filtering policy, as passed to filter_set_default_policy().
 * NOTE(review): presumably superseded by FILTER_OPT_DEFAULT_DENY in
 * enum filter_options - confirm whether this type is still needed.
 */
typedef enum {
FILTER_DEFAULT_ALLOW,
FILTER_DEFAULT_DENY
} filter_policy_t;
/*
 * Bit flags that are OR-ed together into config->filter_opts.
 * The low bits are independent on/off switches; bits 8-10 select the
 * matching method applied to the rules of the filter file.
 */
enum filter_options {
	/* bit 0: when clear, regex rules are compiled with REG_ICASE */
	FILTER_OPT_CASESENSITIVE = 0x0001,
	/* bit 1: match against the full url instead of the host */
	FILTER_OPT_URL           = 0x0002,
	/* bit 2: inverts the verdict returned by filter_run() */
	FILTER_OPT_DEFAULT_DENY  = 0x0004,

	/* bit 8: POSIX basic regular expressions */
	FILTER_OPT_TYPE_BRE      = 0x0100,
	/* bit 9: POSIX extended regular expressions (REG_EXTENDED) */
	FILTER_OPT_TYPE_ERE      = 0x0200,
	/* bit 10: shell-style patterns matched with fnmatch() */
	FILTER_OPT_TYPE_FNMATCH  = 0x0400,
};

/* Every bit that selects a filter type. */
#define FILTER_TYPE_MASK ( \
	FILTER_OPT_TYPE_BRE | \
	FILTER_OPT_TYPE_ERE | \
	FILTER_OPT_TYPE_FNMATCH)
extern void filter_init (void);
extern void filter_destroy (void);
extern void filter_reload (void);
extern int filter_run (const char *str);
extern void filter_set_default_policy (filter_policy_t policy);
#endif

View File

@ -471,22 +471,16 @@ BAD_REQUEST_ERROR:
* Filter restricted domains/urls
*/
if (config->filter) {
if (config->filter_url)
ret = filter_run (url);
else
ret = filter_run (request->host);
int fu = config->filter_opts & FILTER_OPT_URL;
ret = filter_run (fu ? url : request->host);
if (ret) {
update_stats (STAT_DENIED);
if (config->filter_url)
log_message (LOG_NOTICE,
"Proxying refused on filtered url \"%s\"",
url);
else
log_message (LOG_NOTICE,
"Proxying refused on filtered domain \"%s\"",
request->host);
log_message (LOG_NOTICE,
"Proxying refused on filtered %s \"%s\"",
fu ? "url" : "domain",
fu ? url : request->host);
indicate_http_error (connptr, 403, "Filtered",
"detail",