Only in xapian-omega-1.0.1_svn8931: 0.9.9.patch diff -ru xapian-omega-1.0.1_svn8931-orig/omega.cc xapian-omega-1.0.1_svn8931/omega.cc --- xapian-omega-1.0.1_svn8931-orig/omega.cc Mon May 14 06:28:31 2007 +++ xapian-omega-1.0.1_svn8931/omega.cc Mon Sep 3 00:00:53 2007 @@ -34,6 +34,8 @@ #include "safefcntl.h" #include "safeunistd.h" +#include "cdb.h" + #include "omega.h" #include "utils.h" #include "cgiparam.h" @@ -171,6 +173,8 @@ const string & v = val->second; if (v == "AND" || v == "and") default_op = Xapian::Query::OP_AND; + } else { + default_op = Xapian::Query::OP_AND; } val = cgi_params.find("FMT"); @@ -207,7 +211,7 @@ if (query_string.empty()) { // collect the prob fields - g = cgi_params.equal_range("P"); + g = cgi_params.equal_range("query"); for (MCI i = g.first; i != g.second; i++) { const string & v = i->second; if (!v.empty()) { @@ -262,6 +266,96 @@ filters += filter_sep; } } + g = cgi_params.equal_range("group"); + if (g.first != g.second) { + vector filter_v; + for (MCI i = g.first; i != g.second; i++) { + string v = i->second; + if (!v.empty()) { + string::iterator i = v.begin(); + while (i != v.end()) { + if (isalnum((unsigned char)*i) || strchr(".-+*", *i)) { + *i = tolower(*i); + i++; + } else { + v.erase(i); + } + } + i = v.begin() + (v.size() - 1); + if (*i == '*' && i != v.begin() && *(i - 1) != '.') { + v.insert(i, '.'); + } + if (v.substr(0, 6) == "gmane.") v.erase(0, 6); + string cdbfile = cdb_dir + "renamed"; + int fd = open(cdbfile.c_str(), O_RDONLY); + if (fd != -1) { + struct cdb cdb; + cdb_init(&cdb, fd); + + if (cdb_find(&cdb, v.data(), v.length()) > 0) { + size_t datalen = cdb_datalen(&cdb); + char *data = reinterpret_cast(malloc(datalen)); + if (data) { + cdb_read(&cdb, data, datalen, cdb_datapos(&cdb)); + v.assign(data, datalen); + free(data); + } + } + cdb_free(&cdb); + close(fd); + } + if (v != "*") { + v = "G" + v; + add_bterm(v); + filter_v.push_back(v); + } + } + } + sort(filter_v.begin(), filter_v.end()); + vector::const_iterator i; + for (i = filter_v.begin(); i != filter_v.end(); ++i) { + filters += *i; + filters += filter_sep; + } + } + g = cgi_params.equal_range("author"); + if (g.first == g.second) g = cgi_params.equal_range("email"); + if (g.first != g.second) { + vector filter_v; + for (MCI i = g.first; i != g.second; i++) { + string v = i->second; + if (!v.empty()) { + string::iterator i = v.begin(); + while (i != v.end()) { + unsigned char ch = *i; + if (isspace(ch)) { + if (i != v.begin()) { + string term = "A"; + term += v.substr(0, i - v.begin()); + add_bterm(term); + filter_v.push_back(term); + } + v.erase(0, i - v.begin() + 1); + i = v.begin(); + } else { + *i = tolower(ch); + i++; + } + } + if (!v.empty()) { + v = "A" + v; + add_bterm(v); + filter_v.push_back(v); + } + } + } + sort(filter_v.begin(), filter_v.end()); + vector::const_iterator i; + for (i = filter_v.begin(); i != filter_v.end(); ++i) { + filters += *i; + filters += filter_sep; + } + } // date range filters val = cgi_params.find("START"); @@ -341,6 +435,18 @@ if (!sort_ascending) { filters += 'r'; } + } + } + val = cgi_params.find("sort"); + if (val != cgi_params.end()) { + if (val->second == "date") { + sort_key = 0; + sort_after = false; + docid_order = Xapian::Enquire::DESCENDING; + } else if (val->second == "revdate") { + sort_key = 0; + sort_after = false; + docid_order = Xapian::Enquire::ASCENDING; } } diff -ru xapian-omega-1.0.1_svn8931-orig/omega.conf xapian-omega-1.0.1_svn8931/omega.conf --- xapian-omega-1.0.1_svn8931-orig/omega.conf Wed Sep 6 12:37:13 2006 +++ xapian-omega-1.0.1_svn8931/omega.conf Fri Jun 22 22:36:29 2007 @@ -1,11 +1,11 @@ # Directory containing Xapian databases: -database_dir /var/lib/omega/data +database_dir /index # Directory containing OmegaScript templates: -template_dir /var/lib/omega/templates +template_dir /home/xapian/templates # Directory to write Omega logs to: -log_dir /var/log/omega +log_dir /mirror/tmp/xapian-logs # Directory containing any cdb files for the $lookup OmegaScript command: -cdb_dir /var/lib/omega/cdb +cdb_dir /mirror/tmp/xapian-cdb diff -ru xapian-omega-1.0.1_svn8931-orig/query.cc xapian-omega-1.0.1_svn8931/query.cc --- xapian-omega-1.0.1_svn8931-orig/query.cc Wed May 30 11:28:37 2007 +++ xapian-omega-1.0.1_svn8931/query.cc Sun Sep 2 23:59:33 2007 @@ -166,6 +166,8 @@ } }; +static multimap filter_map; + static size_t prefix_from_term(string &prefix, const string &term) { @@ -182,6 +184,34 @@ return i - begin; } + if (term[0] == 'G' && !db.term_exists(term)) { + MCI i = cgi_params.find("group"); + if (i != cgi_params.end()) { + error_msg = "Either the group \""; + error_msg += i->second; + error_msg += "\" doesn't exist, or has no indexable articles " + "(usually this means it was only added very recently, or " + "that all articles have \"X-No-Archive: yes\" headers). " + "The group filter can be a single Gmane group " + "name (e.g. gmane.discuss) or a wildcarded hierarchy (e.g. " + "gmane.comp.*). Since all groups start \"gmane.\", you can " + "omit that. Note that arbitrary wildcards and substring " + "matches aren't currently supported."; + } + } else if (term[0] == 'A' && !db.term_exists(term)) { + if (term.find('@') != string::npos) { + if (term.find("@public.gmane.org") != string::npos) { + error_msg = "Filtering on encrypted addresses (@public.gmane.org) isn't supported."; + } else { + error_msg = "No articles found from e-mail address: \""; + error_msg += term.substr(1); + error_msg += "\". The author filter accepts a complete email " + "address, or a real name. Wildcards aren't currently " + "supported."; + } + } + } + prefix = term[0]; return 1; } @@ -236,7 +266,14 @@ } try { - query = qp.parse_query(query_string); + // Nasty hack - replace "group:gmane." with "group:". + string s(query_string); + size_t i = 0; + while ((i = s.find("group:gmane.", i)) != string::npos) { + i += 6; + s.erase(i, 6); + } + query = qp.parse_query(s); } catch (Xapian::QueryParserError &e) { error_msg = e.get_msg(); return BAD_QUERY; @@ -299,8 +336,6 @@ return SAME_QUERY; } -static multimap filter_map; - typedef multimap::const_iterator FMCI; void add_bterm(const string &term) { @@ -326,11 +361,14 @@ case 1: filter_vec.push_back(Xapian::Query(or_vec[0])); break; - default: - filter_vec.push_back(Xapian::Query(Xapian::Query::OP_OR, + default: { + Xapian::Query::op op = Xapian::Query::OP_OR; + if (current == "A") op = Xapian::Query::OP_AND; + filter_vec.push_back(Xapian::Query(op, or_vec.begin(), or_vec.end())); break; + } } or_vec.clear(); if (over) break; @@ -383,7 +421,7 @@ if (sort_after) { enquire->set_sort_by_relevance_then_value(sort_key, sort_ascending); } else { - enquire->set_sort_by_value_then_relevance(sort_key, sort_ascending); + enquire->set_weighting_scheme(Xapian::BoolWeight()); } } @@ -514,6 +552,61 @@ } static string +html_escape_and_obfuscate(const string &str) +{ + string res; + bool afterat = false; + string::size_type p = 0; + while (p < str.size()) { + char ch = str[p++]; + switch (ch) { + case '<': + res += "<"; + afterat = false; + continue; + case '>': + res += ">"; + afterat = false; + continue; + case '&': + res += "&"; + afterat = false; + continue; + case '"': + res += """; + afterat = false; + continue; + case '@': + if (afterat) { + res += ch; + continue; + } + if (p >= 2) { + char oldch = str[p - 2]; + if (oldch == '@' || isspace((unsigned char)oldch)) { + res += ch; + continue; + } + } + res += " <at> "; // Help defeat address harvesting. + afterat = true; + continue; + case '.': + if (afterat) { + res += " <dot> "; // Help defeat address harvesting. + } else { + res += ch; + } + continue; + default: + if (isspace((unsigned char)ch)) afterat = false; + res += ch; + } + } + return res; +} + +static string html_strip(const string &str) { string res; @@ -649,7 +742,7 @@ match = word_in_list(stem, list); } if (match >= 0) { - res += html_escape(string(l, first.raw() - l)); + res += html_escape_and_obfuscate(string(l, first.raw() - l)); if (!bra.empty()) { res += bra; } else { @@ -662,17 +755,17 @@ res += "\">"; } word = string(first.raw(), j.raw() - first.raw()); - res += html_escape(word); + res += html_escape_and_obfuscate(word); if (!bra.empty()) { res += ket; } else { res += ""; } } else { - res += html_escape(string(l, j.raw() - l)); + res += html_escape_and_obfuscate(string(l, j.raw() - l)); } } - if (j != s_end) res += html_escape(string(j.raw(), j.left())); + if (j != s_end) res += html_escape_and_obfuscate(string(j.raw(), j.left())); return res; } @@ -737,6 +830,7 @@ CMD_find, CMD_fmt, CMD_freq, +CMD_gmaneauthor, CMD_ge, CMD_gt, CMD_highlight, @@ -855,6 +949,7 @@ T(fmt, 0, 0, N, 0), // name of current format T(freq, 1, 1, N, 0), // frequency of a term T(ge, 2, 2, N, 0), // test >= +T(gmaneauthor, 1, 1, N, 0), // Produce HTML from author string for gmane T(gt, 2, 2, N, 0), // test > T(highlight, 2, 4, N, 0), // html escape and highlight words from list T(hit, 0, 0, N, 0), // hit number of current mset entry (starting @@ -1255,6 +1350,67 @@ if (string_to_int(args[0]) >= string_to_int(args[1])) value = "true"; break; + case CMD_gmaneauthor: { + const string & a = args[0]; + if (a.size() > 5) { + if (a[a.size() - 1] == '@') { + string::size_type j = a.rfind('-'); + if (j == string::npos) j = a.rfind('='); + if (j != string::npos) { + value = ""; + value += html_escape(a.substr(0, j)); + value += "@..."; + } + } + } + // See if it ends "@public.gmane.org" or starts with a + // public.gmane.org address. + if (a.size() > 21) { + if (a.substr(a.size() - 18) == "@public.gmane.org>") { + string::size_type i = a.rfind('<'); + string::size_type j = a.rfind('-'); + if (j == string::npos) j = a.rfind('='); + if (i != string::npos && j != string::npos) { + value = html_escape(a.substr(0, i + 1)); + value += ""; + value += html_escape(a.substr(i + 1, j - i - 1)); + value += "@...>"; + } + } else if (a.substr(a.size() - 17) == "@public.gmane.org") { + string::size_type j = a.rfind('-'); + if (j == string::npos) j = a.rfind('='); + if (j != string::npos) { + value = "<"; + value += html_escape(a.substr(0, j)); + value += "@...>"; + } + } else { + size_t p = a.find("@public.gmane.org"); + if (p != string::npos) { + string::size_type j = a.rfind('-', p); + if (j == string::npos) j = a.rfind('=', p); + if (j != string::npos) { + value = "<"; + value += html_escape(a.substr(0, j)); + value += "@...>"; + value += a.substr(p + 17); + } + } + } + } + if (value.empty()) { + value = html_escape_and_obfuscate(a); + } + break; + } case CMD_gt: if (string_to_int(args[0]) > string_to_int(args[1])) value = "true";