#!/usr/bin/python -u
###
### Wrapper script which uses the Swish index/search system to create
### indexes for public and private Mailman mailing lists.  Loosely
### designed based on the Mailman arch bin command.
###
### Anthony R. Thompson - January 2010
### Contact: put @ between art and sigilservices.com
###
### Copyright (C) 1998-2010 by the Free Software Foundation, Inc.
###
### This program is free software; you can redistribute it and/or
### modify it under the terms of the GNU General Public License
### as published by the Free Software Foundation; either version 2
### of the License, or (at your option) any later version.
###
### This program is distributed in the hope that it will be useful,
### but WITHOUT ANY WARRANTY; without even the implied warranty of
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
### GNU General Public License for more details.
###
### You should have received a copy of the GNU General Public License
### along with this program; if not, write to the Free Software
### Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
### 02110-1301, USA.  http://www.fsf.org/licensing/licenses/gpl.html
###

"""Create Swish search indexes for Mailman mailing lists.

Performs a complete indexing of Mailman list archives using Swish.
Also sets up search CGI and config file if necessary, and makes
archive file modification times match message dates to support date
searching.  Doesn't support incremental indexing as Swish doesn't.

Usage: arch_index.py [options] <archivedir>

Where options are:
    -h / --help
        Print this help message and exit.

    -q / --quiet
        Make output from this program and Swish indexer as quiet as
        possible.

    -v / --verbose
        Make output from this program and Swish indexer as verbose as
        possible.

    -o / --overwrite
        Overwrite any existing Swish CGI & configuration files with
        defaults.

    -s / --swish=/path/to/swish-e
        Path to the swish indexer program, if not /usr/bin/swish-e

    -c / --swish-cgi=/path/to/default/swish.cgi
        Path to swish.cgi search prog, if not /usr/lib/swish-e/swish.cgi

    -m / --includemm
        Include the default mailman list, which is normally ignored.
        Has no effect if --public is also used (since it's private).

    -l listname / --list=listname
        Index a specific list; separate multiple lists with commas.
        Causes --private, --public, --includemm to be ignored.

    -x listname / --exclude=listname
        Exclude a specific list; separate multiple lists with commas.
        Can be used to prune --private and --public options.

    --private
        Only create indexes for lists with private archives.

    --public
        Only create indexes for lists with public archives.

    --preserve-times
        Do not update message archive file mod times based on message
        dates.

    --reset-all-times
        Change all archive message file mod times based on message
        dates.  Default: only change new archive files since last run.

Where <archivedir> is the base archives directory of a Mailman
installation, which has public/ and private/ subdirectories.
"""

import os, sys, getopt, re, subprocess, time, shutil

RUNNING_AS = os.geteuid()   # effective user id we're running as now
PROGRAM = os.path.abspath(sys.argv[0])

# Mailman summary files skipped when updating modification times
SKIP_FILES = ['author.html', 'date.html', 'index.html', 'subject.html',
              'thread.html', 'attachment.html']   # no dates in attachments
HTML_RE = re.compile(r'\.html$')   # lowercase only, no .htm
# Group 1 captures the date-time text; the pattern spans the whole line
# (including its trailing newline) so it can be used for replacement.
DATETIME_RE = re.compile(
    r'^.*<[Ii]>([a-zA-Z]{3}\s+[a-zA-Z]{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+'
    r'[a-zA-Z]{3}\s+\d{4}).*\n$')
DATETIME_FMT = '%a %b %d %H:%M:%S %Z %Y'   # e.g., Sun Nov 1 16:23:45 MST 1998



def usage(code, msg=''):
    """Given a return code and optional msg, print usage info and the msg.

    If return code is non-zero, prints to STDERR, otherwise to STDOUT.
    Always terminates the program via sys.exit(code).

    """
    if code:
        fd = sys.stderr
    else:
        fd = sys.stdout
    fd.write("%s\n" % (__doc__,))
    if msg:
        # msg may be an exception object (e.g. from getopt); %s stringifies it
        fd.write("%s\n" % (msg,))
    sys.exit(code)



def archived_list_names(archivedir, filter='all'):
    """Given a Mailman archive directory, return lists with archives.

    The optional filter parameter can specify all, public, or private
    to limit which lists are returned; all is the default.  Exits via
    usage() if the private/ or public/ subdirectory is missing or the
    filter value is not recognised.

    """
    # check for existence of required directories, base and subdirs
    privdir = "%s/private" % (archivedir)
    pubdir = "%s/public" % (archivedir)
    if not os.path.isdir(privdir):
        usage(2, "private archive directory %s does not exist" % (privdir))
    elif not os.path.isdir(pubdir):
        usage(2, "public archive directory %s does not exist" % (pubdir))

    # In mailman, ALL lists have entries in archives/private, and each
    # list there also has a listname.mbox dir which we must ignore; to
    # get truly "private" lists, we must subtract the "public" lists
    all_lists = [item for item in os.listdir(privdir)
                 if os.path.isdir("%s/%s" % (privdir, item))
                 and not item.endswith('.mbox')]
    if filter == 'all':
        return all_lists

    # Entries of public/ other than its index page.  Only names that are
    # also real lists under private/ count as public lists, so stray
    # entries (e.g. listname.mbox links) are never returned for indexing.
    pub_entries = set(os.listdir(pubdir))
    pub_entries.discard('index.html')
    if filter == 'public':
        return [name for name in all_lists if name in pub_entries]
    if filter == 'private':
        return [name for name in all_lists if name not in pub_entries]

    usage(2, "archived_list_names: unknown filter option %s" % (filter))



def lists_to_index(archive_dir, include_mm=False, specific_list=None,
                   private_only=False, public_only=False, exclude_list=None):
    """Given an archive directory and various flags, return lists to index.

    specific_list and exclude_list are comma-separated list names.  When
    specific_list is given, only those of its names that have archives
    are used and private_only, public_only and include_mm are ignored
    (as documented for the -l option).  Otherwise the default 'mailman'
    list is dropped unless include_mm is true.  Names in exclude_list
    are always removed from the result.

    """
    if specific_list:
        all_lists = archived_list_names(archive_dir, 'all')
        index_lists = [name for name in specific_list.split(',')
                       if name in all_lists]
    else:
        if private_only:
            index_lists = archived_list_names(archive_dir, 'private')
        elif public_only:
            index_lists = archived_list_names(archive_dir, 'public')
        else:
            index_lists = archived_list_names(archive_dir, 'all')
        if (not include_mm) and ('mailman' in index_lists):
            index_lists.remove('mailman')

    if exclude_list:
        for listname in exclude_list.split(','):
            if listname in index_lists:
                index_lists.remove(listname)
    return index_lists



def setup_swish_config(archivedir, listname, public=False, overwrite=False):
    """Given an archive dir and list, return path to Swish config file.

    If the Swish config file does not exist, it will be created with
    default values from this function.  If it does exist and the
    optional overwrite flag is given, it will be overwritten.  See:
    http://wpkg.org/Integrating_Mailman_with_a_Swish-e_search_engine

    By default this function assumes that the list is private, and
    thus its web link is foo.com/cgi-bin/mailman/private/listname.
    Pass public=True for a public list, to use foo.com/pipermail/listname

    """
    # normalize paths so they look pretty in the swish config file :)
    priv_base = os.path.normpath("%s/private" % (archivedir))
    index_root = os.path.normpath("%s/%s" % (priv_base, listname))
    index_file = os.path.normpath("%s/%s.swish-index" % (index_root, listname))
    index_conf = os.path.normpath("%s/%s.swish-conf" % (index_root, listname))
    if public:
        web_base = '/pipermail'                 # note no trailing slash
    else:
        web_base = '/cgi-bin/mailman/private'   # note no trailing slash

    # leave an existing config file alone unless asked to regenerate it
    if os.path.isfile(index_conf) and not overwrite:
        return index_conf

    # raw string: the backslashes in FileRules must reach swish verbatim
    output = r"""# Swish config file auto-generated by %s

# The swish index file to be created
IndexFile %s

# Where to index - this directory and all subdirectories
IndexDir %s

# Only index files with these extensions (.html in this case)
IndexOnly .html

# Mailman summary files we should ignore: author.html, date.html etc.
# Skip attachments because we can't assign meaningful dates to them
FileRules filename is (author\.html|date\.html|index\.html|subject\.html|thread\.html|attachment\.html)

# Replace real directory with web-accessible (virtual) directory in results
ReplaceRules replace "%s" "%s"

# Store description in search results - this is for result excerpts
IndexContents HTML .html
StoreDescription HTML <body> 200000

# Meta tags to index also
MetaNames swishtitle

# Follow symbolic links - allows following public archive links
FollowSymLinks yes
""" % (PROGRAM, index_file, index_root, priv_base, web_base)
    outfile = open(index_conf, 'w')
    try:
        outfile.write(output)
    finally:
        outfile.close()

    return index_conf



def setup_swish_cgi(archivedir, defaultcgi, overwrite=False):
    """Given an archive dir, ensure master swish.cgi is set up to search.

    This copies a default swish.cgi to the main private archive
    directory and then customizes it to point to a customization file
    in the same directory (swish.cgi.conf).  See
    setup_swish_cgi_config() for information on how that is set up.
    Returns the location of the master swish.cgi script.

    The term "master" is used because one swish.cgi is used to search
    all lists, which is possible through the use of a LISTNAME
    environment variable that the config file references.

    Note that if there is already a swish.cgi file in place, this will
    not overwrite it unless passed overwrite=True.  Also, we assume
    that defaultcgi is already checked as valid.  If the copy fails, a
    message is printed to STDERR and the target is left untouched; the
    intended path is still returned.

    """
    newcgi = "%s/private/swish.cgi" % (archivedir)
    newcgiconf = os.path.abspath("%s/private/swish.cgi.conf" % (archivedir))

    # leave an existing script alone unless asked to regenerate it
    if os.path.isfile(newcgi) and not overwrite:
        return newcgi

    # first try copying the default cgi to its new home; without a fresh
    # copy there is nothing to customize (and re-customizing an older
    # copy would apply the template rename a second time), so stop here
    try:
        shutil.copyfile(defaultcgi, newcgi)
    except IOError:
        sys.stderr.write("Unable to create %s\n" % (newcgi))
        return newcgi

    # now customize that file
    fh = open(newcgi, 'r')
    try:
        orig_contents = fh.readlines()
    finally:
        fh.close()

    default_config_re = re.compile(r'^my \$DEFAULT_CONFIG_FILE')
    template_re = re.compile('SWISH::TemplateDefault')
    new_contents = []
    for line in orig_contents:
        # switch to the Mailman-specific template module (no-op if absent)
        line = template_re.sub('SWISH::TemplateDefault_MM', line)
        # point the script at our generated config file
        if default_config_re.search(line):
            line = "my $DEFAULT_CONFIG_FILE = '%s';\n" % (newcgiconf)
        new_contents.append(line)

    fh = open(newcgi, 'w')
    try:
        fh.writelines(new_contents)
    finally:
        fh.close()

    return newcgi



def setup_swish_cgi_config(archivedir, swishbin, overwrite=False):
    """Create the swish.cgi config file for an archive dir if needed.

    The config file lives at <archivedir>/private/swish.cgi.conf.  It
    is written from the built-in template when it is missing, or when
    overwrite is true; an existing file is otherwise left untouched.
    The template records the path of the swish binary (swishbin) and
    locates each list's index through the LISTNAME environment variable.
    http://wpkg.org/Integrating_Mailman_with_a_Swish-e_search_engine

    Normally run after setup_swish_cgi().  Returns the path of the
    config file.

    """
    # absolute path so the generated Perl config is location-independent
    private_root = os.path.abspath("%s/private" % (archivedir))
    conf_path = private_root + "/swish.cgi.conf"

    # nothing to do when a config is already in place and must be kept
    if os.path.isfile(conf_path) and not overwrite:
        return conf_path

    template = """# swish.cgi config file auto-generated by %s
return {
    title           => "Search $ENV{'LISTNAME'} archives",
    swish_binary    => '%s',
    swish_index     => '%s/' . $ENV{'LISTNAME'} . '/' . $ENV{'LISTNAME'} . '.swish-index',
    page_size       => 30,
    sorts           => [qw/swishrank swishlastmodified swishtitle/],
    metanames       => [qw/swishdefault swishtitle/],
    display_props   => [qw/swishlastmodified/],
    on_intranet     => 1,   # hide swish images
    name_labels => {
        swishdefault        => 'Entire Message',      # custom
        swishtitle          => 'Subject',             # custom
        swishrank           => 'Search Match',        # custom
        swishlastmodified   => 'Posting Date',        # custom
        swishdocpath        => 'Document Path',       # default
        swishdocsize        => 'Document Size',       # default
        all                 => 'All',                 # default
        subject             => 'Message Subject',     # default
        name                => "Poster's Name",       # default
        email               => "Poster's Email",      # default
        sent                => 'Message Date',        # default
    },
};
"""
    contents = template % (PROGRAM, swishbin, private_root)
    conf_fh = open(conf_path, 'w')
    conf_fh.write(contents)
    conf_fh.close()
    return conf_path



def get_first_message_datetime(filepath):
    """Return the first date/time string found in the given file.

    Looks for a specific kind of date format, that used by Mailman's
    pipermail in creating HTML message archive pages, for example:
    Sat Nov 1 19:38:39 MST 1997.  Returns empty string if none.

    The file is read line by line and reading stops at the first
    matching line.  For performance, this does NOT check to see if the
    file is valid; errors from opening or reading it propagate.

    """
    msgfh = open(filepath, 'r')
    try:
        for line in msgfh:
            found = DATETIME_RE.search(line)
            if found:
                # group 1 is the date-time text itself
                return found.group(1)
    finally:
        msgfh.close()
    return ''



def _take_file_ownership(path):
    """Replace path with a copy owned by the current user; True on success.

    Renames the file aside, copies it back under the original name (the
    copy belongs to whoever runs this program), then removes the renamed
    original.  Problems are reported on STDERR.  If the copy fails the
    rename is rolled back and False is returned.

    """
    tmppath = path + '.tmp'
    # rename orig message file to temp filename
    try:
        os.rename(path, tmppath)
    except OSError:
        sys.stderr.write("%s: unable to rename to %s\n" % (path, tmppath))
        return False
    # now copy temp filename back to orig file to get ownership
    try:
        shutil.copyfile(tmppath, path)
    except IOError:
        sys.stderr.write("%s: unable to copy to %s\n" % (tmppath, path))
        # if copy back failed, try to undo rename to restore state
        try:
            os.rename(tmppath, path)
        except OSError:
            sys.stderr.write("%s: unable to rename to %s\n" % (tmppath, path))
        return False
    # finally, remove the temp file; a leftover is reported but not fatal
    try:
        os.remove(tmppath)
    except OSError:
        sys.stderr.write("%s: unable to remove\n" % (tmppath))
    return True


def update_mtime_from_message(msgpath):
    """Given a Mailman archive file, update mtime from first date-time in it.

    Return the mtime number used to update the file (i.e., the
    converted date-time string from the message), or None if there was
    no such string, there was an error in converting the string, or
    ownership of the file could not be obtained.
    For performance, this does NOT check to see if the file is valid.

    """
    dt = get_first_message_datetime(msgpath)
    if dt == '':
        return None   # couldn't get a date from the msg file

    # Convert first, so a file with an unusable date is left untouched.
    # strptime raises ValueError for text it cannot parse (for instance
    # a timezone name it does not know); both exception types must be
    # listed in a tuple to be caught.
    try:
        dtfp = time.mktime(time.strptime(dt, DATETIME_FMT))
    except (OverflowError, ValueError):
        sys.stderr.write("%s: bad datetime from msg: %s\n" % (msgpath, dt))
        return None

    # If we're not running as owner of file, we won't be able to change
    # the modification time of the file, so we have to engage in some
    # rename-copy-delete hackery to get ownership of the file (blech).
    if RUNNING_AS != os.stat(msgpath).st_uid:
        if not _take_file_ownership(msgpath):
            return None

    # Whether we owned the file to start, or had to do hackery to
    # get ownership, we can now change its modification time
    try:
        os.utime(msgpath, (dtfp, dtfp))   # sets mtime and atime
    except (OverflowError, ValueError):
        sys.stderr.write("%s: bad datetime from msg: %s\n" % (msgpath, dt))
        return None
    return dtfp



def update_archive_page_mtimes(listdir, tsfile=None):
    """Set mtimes of a list's message archive pages from their contents.

    Walks listdir recursively.  Every file whose name ends in .html and
    is not one of the summary pages in SKIP_FILES (author.html,
    date.html, etc.) is handed to update_mtime_from_message, which sets
    the file's modification time to the first date-time found in it.

    If tsfile is given, only files modified more recently than tsfile
    are processed.

    We do this so that the Swish archive indexer can rely on page
    modification times and thus allow searching by date range.

    """
    # cutoff of None means no timestamp file: process every message page
    if tsfile:
        cutoff = os.path.getmtime(tsfile)
    else:
        cutoff = None

    for dirpath, _subdirs, filenames in os.walk(listdir):
        for name in filenames:
            # only message pages: *.html that is not a summary page
            if not HTML_RE.search(name) or name in SKIP_FILES:
                continue
            fullpath = os.path.join(dirpath, name)
            # with a cutoff, skip pages not newer than the timestamp file
            if cutoff is not None and not (os.path.getmtime(fullpath) > cutoff):
                continue
            update_mtime_from_message(fullpath)



def main():
    """Run the Swish search indexer on one or more Mailman list archives.

    Parses the command line (see the module docstring for the options),
    makes sure the shared swish.cgi and its config file exist, and then
    for each selected list: optionally syncs archive page mtimes with
    message dates, makes sure a swish config file exists, runs the swish
    indexer and touches the list's timestamp file.

    Prints swish command results, and returns a list of mailing lists
    which were successfully indexed, based on swish's return code.
    Exits via usage() on bad arguments.

    """

    # get command line arguments; long options that take a value need
    # a trailing '=' or getopt rejects "--swish=/path"
    try:
        opts, args = getopt.getopt(
            sys.argv[1:], 'hqvs:c:oml:x:',
            ['help', 'quiet', 'verbose', 'swish=', 'swish-cgi=',
             'overwrite', 'includemm', 'list=', 'exclude=', 'private',
             'public', 'preserve-times', 'reset-all-times'])
    except getopt.error:
        # fetch the exception this way so the syntax works on any Python
        usage(1, sys.exc_info()[1])

    # set default options
    swish = '/usr/bin/swish-e'
    swishcgi = '/usr/lib/swish-e/swish.cgi'
    quiet = False
    verbose = False
    overwrite = False
    inclmm = False
    speclist = None
    exclist = None
    privonly = False
    pubonly = False
    preservetimes = False
    resettimes = False

    # originally we thought Swish supported incremental indexing;
    # leaving in code to handle that because it might at some point;
    # see http://swish-e.org/docs/swish-faq.html for more info.
    # if we re-enable wipe, we can remove reset-all-times (same)
    wipe = True

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-q', '--quiet'):
            quiet = True
        elif opt in ('-o', '--overwrite'):
            overwrite = True
        elif opt in ('-s', '--swish'):
            swish = arg
        elif opt in ('-c', '--swish-cgi'):
            swishcgi = arg
        elif opt in ('-m', '--includemm'):
            inclmm = True
        elif opt in ('-l', '--list'):
            speclist = arg
        elif opt in ('-x', '--exclude'):
            exclist = arg
        elif opt == '--private':
            privonly = True
        elif opt == '--public':
            pubonly = True
        elif opt == '--preserve-times':
            preservetimes = True
        elif opt == '--reset-all-times':
            resettimes = True
        #elif opt == '--wipe': wipe = True
        elif opt in ('-v', '--verbose'):
            verbose = True
            quiet = False

    # check whether required arguments are supplied and have good values
    if len(args) < 1:
        usage(1, 'archive directory is required')
    archive_dir = os.path.abspath(args[0])
    if not os.path.isdir(archive_dir):
        usage(1, "archive directory %s does not exist" % (archive_dir))

    if not os.path.isfile(swish):
        usage(1, "unable to find swish indexer program at %s" % (swish))
    if not os.path.isfile(swishcgi):
        usage(1, "unable to find swish.cgi search program at %s" % (swishcgi))

    # figure out what lists we actually need to index
    index_lists = lists_to_index(archive_dir=archive_dir, include_mm=inclmm,
                                 specific_list=speclist, exclude_list=exclist,
                                 public_only=pubonly, private_only=privonly)
    # check if we inadvertently ended up with no lists to process
    if len(index_lists) == 0:
        usage(1, 'No lists to index')

    # see if we need to set up swish.cgi for searching, and its config file
    setup_swish_cgi(archive_dir, swishcgi, overwrite)
    setup_swish_cgi_config(archive_dir, swish, overwrite)

    # go through each list and create search index, update msg mtimes, etc.
    indexed = []
    for listname in index_lists:
        listdir = "%s/private/%s" % (archive_dir, listname)

        # the timestamp file used to find "new" archive pages
        tsfile = "%s/.arch_index_timestamp" % (listdir)

        # check if we need to update message file modification times
        if not preservetimes:
            #if (not wipe) and os.path.isfile(tsfile):
            if (not resettimes) and os.path.isfile(tsfile):
                update_archive_page_mtimes(listdir, tsfile)
            else:
                update_archive_page_mtimes(listdir)

        # make sure that a swish config file exists for this list; a list
        # is treated as public when it has an entry under public/
        if not quiet:
            print("Setting up swish config file for " + listname)
        publ = os.path.isdir("%s/public/%s" % (archive_dir, listname))
        conf_file = setup_swish_config(archive_dir, listname, publ, overwrite)
        cmdlist = [swish, '-c', conf_file]

        # account for --quiet or --verbose option, if given
        if quiet:
            cmdlist.extend(['-v', '0'])
        elif verbose:
            cmdlist.extend(['-v', '3'])

        # if no timestamp file yet, behave as if --wipe was given
        #if (not wipe) and os.path.isfile(tsfile):cmdlist.extend(['-N',tsfile])

        # by not reassigning stdout/stderr, we print swish output automatically
        if not quiet:
            print("Indexing " + listname)
        if verbose:
            print(" ".join(cmdlist))

        # if swish gave a return code of 0, assume success and save list name
        if subprocess.call(cmdlist) == 0:
            indexed.append(listname)

        # create/update timestamp file (opening in append mode creates it)
        tsfh = open(tsfile, 'a')
        try:
            os.utime(tsfile, None)
        finally:
            tsfh.close()

    if not quiet:
        print('Indexed: ' + " ".join(indexed))
    return indexed



# Run the indexer only when this file is executed as a script, so that
# importing it (e.g. to reuse the helper functions) has no side effects.
if __name__ == '__main__':
    main()