#!/usr/bin/python

# mgrep: a tool for identifying line-delimited sections of a file with
# specific textual characteristics.
#
# Copyright (c) 2010 Joseph A Knapka.  Released according to the terms
# of the BSD license: http://www.opensource.org/licenses/bsd-license.php
#
# Example use:
#
# mgrep.py --start="<myTag " --end="</myTag>" --target="foo|baz" myFile.xml
#
# This command looks at each section of myFile.xml delimited
# by lines containing "<myTag " (at the beginning) and "</myTag>"
# (at the end). It prints the entirety of each such section which
# contains any line matching the regexp "foo|baz".
#
# See the usage() function for complete usage information.
#
# TODO:
#
# * It'd be nice to handle nesting of the start and end tags.

import sys
import re

def usage():
  ''' Print the command-line help text to standard output.

  Takes no arguments and returns None. '''
  # The call is parenthesised so that it parses as a print statement on
  # Python 2 and as a function call on Python 3, with identical output.
  print('''
mgrep.py -- search a file for delimited sections containing specific text.

Version: 1.2 June 2 2010

mgrep.py --start=start_re [--end=end_re] [--target=targ_re] [-o|-v|-V] [-e] [-d[="<divider>"]] [--n=N] [--skip=N] [file]

Options (note that single-letter options MAY NOT be combined. -d -v != -dv):

  --start=start_re	Specify the regular expression that marks the beginning
			of a region of interest. All regexp-valued arguments
                        accept Python regexp syntax (similar but not identical
                        to PCRE syntax).

  --end=end_re          Specify the regular expression that marks the end
			of a region of interest. If omitted, defaults to
			start_re, and the final line of each region will
			not be printed with the region. A single line can
			be both the end of one region of interest and the
			start of the next.

  --target=target_re	Specify a regular expression that must be matched
			within a region of interest for the region to be
			printed to standard output. --target may be specified
                        multiple times, in which case ALL target regexps
                        must be present for a region to be emitted, although
                        not necessarily on the same line. (Though see -o, -v,
                        and -V, below.) If omitted, defaults to the match-anything
                        regexp ".*", so that every region delimited by start_re
                        and end_re is printed.

  -o                    Print regions that match ANY of the target_re
                        regexps (this_regexp "or" that_one).  In the
                        absence of -o, ALL target_res must match in order
                        to print a region. (Note that you can get the
                        same effect by using a target RE with the pipe
                        operator: --target="this_re|that_one". Using
                        -o may be easier in some cases.)

  -v			Print only those regions whose contents fail to
			match all of the target_re regexps. Regions
                        containing some, but not all, of the target_re
                        WILL be printed, as will regions containing NONE
                        of the target_res.

  -V			Print only those regions whose contents fail to
			match any of the target_re regexps. A region matching
                        even a single target_re WILL NOT be printed.
                        When only one --target is specified, -v and -V are
                        equivalent.

  -e                    Do not consider end-of-file to be the end of a section
                        of interest. Without -e, any section-of-interest that
                        is being parsed when EOF is encountered WILL be printed,
                        even if an --end expression has not been seen. With -e,
                        an --end expression must be seen before EOF for a
                        section-of-interest to be printed.

  -d[=<divider>]		With no argument, do not print a delimiter "-----"
                        between regions of interest in the output. With an
                        argument, print the divider string between regions.

  --n=N			Specify the number of lines of each selected region
			to print.  By default, the entirety of each selected
			region is printed, except when --end is not specified.

  --skip=N		Specify a number of lines to ignore at the start
			of the input file.

  file			Name of the file to process. If omitted, read from
                        standard input -- mgrep plays well with pipelines.
''')

# ---- Run-time settings, filled in by parseArg() / parseArgs() ----

# Compiled regexp from --start; parseArgs() prints usage and exits if it
# is still None after the command line has been read.
startTagRe=None
# Compiled regexp from --end; parseArgs() falls back to startTagRe when
# --end was not given.
endTagRe=None
# Compiled --target regexps; parseArgs() substitutes a single ".*" when
# the list is empty.
targetRes=[]
# One boolean per entry of targetRes, recording whether that target has
# been seen in the current section. Set by checkTargets(), reset by
# processLine() whenever a section starts or ends.
targetFlags=[False]
# Selects the emit rule in maybeEmit(): 0 = all targets matched (default),
# 1 = -v (not all matched), 2 = -V (none matched), 3 = -o (any matched).
invert=0
# Input file name; None means read standard input.
inFname=None
# True when --end was omitted: writeRegion() then drops the last line of
# each region.
omitFinal=False
# Number of leading input lines still to be ignored (--skip=N);
# processLine() counts it down.
skipLines=0
# Maximum number of lines to print per region (--n=N); writeRegion() only
# truncates when this is greater than zero.
outLines=-1
# Divider printed before each emitted region; -d replaces it with False
# (no divider) or with a user-supplied string.
writeDiv='-----'
# When True, processLine() treats end-of-file as the end of an open
# section. The -e option is meant to turn this off.
eofCountsAsEnd=True

# ---- Command-line option spellings recognised by parseArg() ----
startArg='--start='
endArg='--end='
targetArg='--target='
invertArg='-v'
reallyInvertArg='-V'
anyMatchArg='-o'
skipArg='--skip='
outArg='--n='
divArg='-d'
noEofArg='-e'

# Lines of the section currently being accumulated, or None when the
# parser is not inside a section.
curSect=None

def processLine(line,eof=False):
  '''Feed one input line to the section parser.

  line -- the text of the line, as read from the input.
  eof  -- True when this call marks end-of-file; the line is then either
          the last line of the file or an empty dummy line supplied only
          so that the EOF event can be processed.

  Works entirely through module globals: it counts down skipLines,
  accumulates the current section in curSect, maintains targetFlags,
  and hands each completed section to maybeEmit(). Returns None. '''
  global curSect,skipLines,targetFlags
  if skipLines>0:
    # Still inside the --skip prefix: discard the line.
    skipLines-=1
    return
  if curSect is not None:
    curSect.append(line)
    endSeen=endTagRe.search(line) is not None
    # Is the section complete?
    if endSeen or (eof and eofCountsAsEnd):
      # With an explicit --end the closing line is printed as part of
      # the region, so a target appearing on it must count. When the end
      # pattern defaulted to the start pattern (omitFinal), the closing
      # line is dropped from this region and belongs to the next one,
      # so it is not examined here.
      if endSeen and not omitFinal:
        checkTargets(line)
      maybeEmit(curSect,targetFlags,invert)
      curSect=None
      targetFlags=[False]*len(targetRes)
    else:
      # Still in section; check for the target(s).
      checkTargets(line)
  # Deliberately a second "if", not an "else": the line that just closed
  # one section may also open the next.
  if curSect is None:
    # Are we at the start of a section?
    if startTagRe.search(line) is not None:
      curSect=[line]
      targetFlags=[False]*len(targetRes)
      checkTargets(line)

def checkTargets(line):
  ''' Record which target regexps occur in line.

  For every compiled pattern in targetRes that is found in line, the
  flag at the same position in targetFlags is set to True. Flags that
  are already set are left alone. '''
  for position,pattern in enumerate(targetRes):
    if pattern.search(line) is not None:
      targetFlags[position]=True

def maybeEmit(lines,matchFlags,invert):
  ''' Decide whether a completed section should be written, and pass
  the section and that decision on to writeRegion().

  lines      -- the section's lines, or None (in which case nothing
                happens).
  matchFlags -- one truth value per target regexp.
  invert     -- which rule to apply: 0 = every flag set, 1 = not every
                flag set (-v), 2 = no flag set (-V), 3 = at least one
                flag set (-o). '''
  if lines is None:
    return
  everyHit=all(matchFlags)
  anyHit=any(matchFlags)
  # Position in this tuple corresponds to the value of invert.
  verdicts=(everyHit,
            not everyHit, # Handle -v vs -V
            not anyHit,
            anyHit)
  writeRegion(lines,verdicts[invert])

def writeRegion(lines,emit):
  ''' Details of writing out a region of interest. '''
  if emit:
    if omitFinal:
      lines=lines[:-1]
    if outLines>0:
      lines=lines[:outLines]
    if writeDiv:
      print writeDiv
    for line in lines:
      print line,

def stripQuotes(s):
  ''' Remove all double-quote characters from the start and end of a
  string and return the result.

  Quotes in the interior of the string are kept. An empty string, or a
  string consisting only of double-quotes, yields the empty string. '''
  # str.strip copes with the empty string and needs no recursion.
  return s.strip('"')

def parseArg(arg):
  ''' Parse a single command-line argument and store its meaning in the
  module-level settings.

  arg -- one command-line word, e.g. '--start=<tag', '-v' or a file name.

  Anything that is not recognised as an option is taken to be the input
  file name. int() raises ValueError if the value given to --skip= or
  --n= is not an integer, and re.compile() raises re.error for an
  invalid regular expression. '''
  # eofCountsAsEnd must be listed here, otherwise the assignment made
  # for -e would only create a local variable and the option would be
  # silently ignored. targetRes is only appended to, so it needs no
  # global declaration.
  global startTagRe,endTagRe,invert,inFname,skipLines,outLines,\
         writeDiv,eofCountsAsEnd
  if arg.startswith(startArg):
    startTagRe=re.compile(stripQuotes(arg[len(startArg):]))
  elif arg.startswith(endArg):
    endTagRe=re.compile(stripQuotes(arg[len(endArg):]))
  elif arg.startswith(targetArg):
    targetRes.append(re.compile(stripQuotes(arg[len(targetArg):])))
  elif arg==invertArg:
    invert=1
  elif arg==reallyInvertArg:
    invert=2
  elif arg==anyMatchArg:
    invert=3
  elif arg.startswith(skipArg):
    skipLines=int(arg[len(skipArg):])
  elif arg.startswith(outArg):
    outLines=int(arg[len(outArg):])
  elif arg.startswith(divArg):
    writeDiv=getDivString(arg)
  elif arg.startswith(noEofArg):
    eofCountsAsEnd=False
  else:
    inFname=arg

def getDivString(arg):
  ''' Work out the divider setting from a -d command-line argument.

  Returns the text following '-d=' (with surrounding double-quotes
  removed by stripQuotes) when arg has that form, and False otherwise. '''
  if not arg.startswith('-d='):
    return False
  return stripQuotes(arg[3:])

def parseArgs(args):
  ''' Parse the whole command line and fill in defaults.

  args -- the command-line words, without the program name.

  Each word is handed to parseArg(). Afterwards, a missing --end falls
  back to the --start pattern (and omitFinal is switched on), a missing
  --start prints the usage text and exits with status 0, and a missing
  --target is replaced by the match-anything pattern. '''
  global endTagRe,omitFinal,targetRes
  for word in args:
    parseArg(word)
  if endTagRe is None:
    omitFinal=True
    endTagRe=startTagRe
  if startTagRe is None:
    usage()
    sys.exit(0)
  targetRes=targetRes or [re.compile('.*')]

# Main entry point: parse the command line, feed every input line to
# processLine(), then signal end-of-file.
if __name__=='__main__':
  parseArgs(sys.argv[1:])
  if inFname is not None:
    inf=open(inFname)
  else:
    inf=sys.stdin
  try:
    # Iterate the file object directly; xreadlines() is deprecated and
    # does not exist on Python 3.
    for line in inf:
      processLine(line)
  finally:
    # Close only what was opened here; leave standard input alone.
    if inf is not sys.stdin:
      inf.close()
  # Make sure we emit any region that is of interest, but is
  # incomplete when EOF is seen.
  processLine('',True)