#! /usr/bin/env python
"""Filter and print a YAML reference list (publications) by keyword,
category, attribute and regular expression; optionally emit summary
tables (per country, per category, per feature, per year)."""

import sys, os
import re
from reference_class import *
import tiny_bits

# Script base name (no directory, no extension); used to label log output.
say_str = os.path.splitext(os.path.split(__file__)[1])[0]

def split_regex( regex_string ):
    """
    #+
    # NAME:
    #	split_regex
    # PURPOSE:
    #	Interpret regular expression
    # INPUTS:
    #	regex_string	string
    #			comma separated list of key=value pairs.
    #			If the key is omitted (i.e. only the value is specified) then
    #			the key name is assumed to be 'key'.
    #			The value is optionally bracketed by single or double quotes
    #
    #			key names 'key' and 'cat' are used to filter the topkeys list
    # OUTPUTS:
    #	result		dictionary of key-value pairs
    # EXAMPLE:
    #	input:			output:
    #	value1			{'key': 'value1'}
    #	key=value1		{'key': 'value1'}
    #	k=value1,value2		{'k': 'value1', 'key': 'value2'}
    #
    #	'cat','key','attr' have special meaning.
    #	All others refer to entries associated with
    #	with publications: author, title, etc.
    #-
    """
    say = tiny_bits.say('%s.%s'%(say_str,tiny_bits.whoami()))

    regex_map = dict()
    if regex_string != '':
        string = ','+regex_string+','   # Add leading and trailing comma

        # Pattern, one key=value pair per match:
        #   start with comma
        #   (optional) anything that is not an equal sign or a comma, up to equal sign
        #   double quote, single quote, or null
        #   (optional) anything
        #   closing double quote, single quote, or null (backreference \2)
        #   end with comma
        r = re.compile(r',([^=,]+=)?("|\'|)(.+?)\2,')

        say.debug( string )
        m = r.match(string)
        while m:
            # Group 1 is 'name=' (strip the '='); absent group means default key 'key'.
            key = m.group(1)[0:-1] if m.group(1) else 'key'
            val = m.group(3)
            regex_map[key] = val
            say.debug( "%s -> %s"%(key,val) )
            if m.end(0) == len(string):
                break
            # Re-anchor on the trailing comma of the previous match so it
            # doubles as the leading comma of the next pair.
            string = string[m.end(0)-1:]
            say.debug( string )
            m = r.match(string)

    say.say( 'regex map %s'%regex_map )
    return regex_map

if __name__ == '__main__':

    from optparse import OptionParser

    # Pick defaults from the ECLIPSE tree if present, else the SMEI tree, else none.
    default_keyword_stack = \
        os.path.join( os.environ['ECLIPSE'], 'reports','etc','data-papers.keys'  ) if 'ECLIPSE' in os.environ else \
        os.path.join( os.environ['SMEI'   ], 'ucsd','gen','etc','solar_system.keys' ) if 'SMEI' in os.environ else \
        ''
    default_reference_list = \
        os.path.join( os.environ['ECLIPSE'], 'reports','etc','data-papers.yaml') if 'ECLIPSE' in os.environ else \
        os.path.join( os.environ['SMEI'   ], 'ucsd','gen','etc','solar_system.yaml' ) if 'SMEI' in os.environ else \
        ''

    version = '0.00'
    usage = "%prog \n" + \
        ("\tdefault ReferenceList: %s\n"%default_reference_list if default_reference_list != '' else '') + \
        ("\tdefault KeywordStack : %s\n"%default_keyword_stack  if default_keyword_stack  != '' else '') + \
        "\t--format html-par[=html-root,head-file,tail-file] |\n" + \
        "\t html-list[=html-root,head-file,tail-file] |\n" + \
        "\t xinc-par | xinc-list | simple | bibtex | raw"

    parser = OptionParser(usage=usage,version=version)

    parser.add_option('-v', '--verbose', dest='verbose',
        action='store_true', default=False,
        help='verbose output')
    parser.add_option('', '--debug', dest='debug',
        action='store', type='int', default=0,
        help='set debug level')
    parser.add_option('-n', '--dry-run', dest='dryrun',
        action='store_true', default=False,
        help='make dryrun')
    parser.add_option('', '--start-time', dest='start_time',
        action='store', type='string', default=None,
        help='use only pubs later than start date YYYY-MM (inclusive)')
    parser.add_option('', '--stop-time', dest='stop_time',
        action='store', type='string', default=None,
        help='use only pubs earlier than stop date YYYY-MM (exclusive)')
    parser.add_option('', '--keyword-file', dest='keyword_file',
        action='store', type='string', default=default_keyword_stack,
        help='YAML file with categorization of top keys')
    parser.add_option('', '--show-keywords', dest='show_keywords',
        action='store_true', default=False,
        help='show layout of keyword categorization')
    parser.add_option('-a', '--attributes', dest='attributes',
        action='store', type='string', default='',
        help='paper attributes in form attr1=True|False,attr2=True|False')
    parser.add_option('-f', '--find-regex', dest='find_regex',
        action='store', type='string', default='',
        help='use only refs matching specified regex')
    parser.add_option('-p', '--print-references', dest='print_references',
        action='store_true', default=False,
        help='print references')
    parser.add_option('', '--format', dest='format',
        action='store', type='string', default='simple',
        help='defines format for printing references: simple, raw, bibtex, html-par, html-list, xinc-par, xinc-list')
    # authors_by_country_detail: 'country of origin of authors on papers using CAIDA data'
    # papers_by_category_detail: 'published papers using CAIDA data'
    parser.add_option('', '--title', dest='title',
        action='store', type='string', default='',
        help='title for html page')
    parser.add_option('', '--per-year', dest='per_year',
        action='store_true', default=False,
        help='print reference count for each year')
    parser.add_option('', '--authors-by-country-summary', dest='authors_by_country_summary',
        action='store_true', default=False,
        help='write html page to stdout with table for nr of authors per country')
    parser.add_option('', '--authors-by-country-detail', dest='authors_by_country_detail',
        action='store_true', default=False,
        help='write table for nr of authors per country per year and month')
    parser.add_option('', '--papers-by-category-detail', dest='papers_by_category_detail',
        action='store_true', default=False,
        help='write table with nr of papers per category per year and month')
    parser.add_option('', '--papers-by-feature-detail', dest='papers_by_feature_detail',
        action='store_true', default=False,
        help='write table with nr of papers per feature per year and month')
    parser.add_option('', '--papers-summary', dest='papers_summary',
        action='store_true', default=False,
        help='writes table with nr of papers per year and month')
    parser.add_option('', '--disjoint-categories', dest='disjoint_cat',
        action='store_true', default=False,
        help='count only the first category for a paper')

    options, args = parser.parse_args()

    # First positional argument overrides the default reference list.
    pubfile = args[0] if len(args) > 0 else default_reference_list

    # BUGFIX: these previously called 'parse.error', an undefined name,
    # which would raise NameError instead of printing the usage message.
    if options.keyword_file == '':
        parser.error( 'no KeywordStack file specified' )
    if pubfile == '':
        parser.error( 'no ReferenceList file specified' )

    say = tiny_bits.say(
        label   = os.path.splitext(os.path.split(__file__)[1])[0],
        verbose = max(options.verbose,options.debug),
        dryrun  = options.dryrun,
    )

    regex_map = split_regex( options.find_regex )

    topkeys = KeywordStack( options.keyword_file )
    try:
        topkeys.filter_by_regex(regex_map)            # Retain categories/keywords matching regex
        # TODO: Need more checking here to make sure that attributes exist in keyword_file
        topkeys.filter_attributes(options.attributes) # Only retain topkeys with the specified attributes
    except Exception:   # narrowed from bare 'except:'; still converted to ReferenceError
        raise ReferenceError( '%s, topkeys not available'%tiny_bits.whoami() )

    if options.show_keywords:
        # Display the full keyword tree (including empty/attribute entries) and quit.
        topkeys.keep_zero = True
        topkeys.keep_attr = True
        say.yell( topkeys.__str__() )
        sys.exit()

    lst = ReferenceList(pubfile,options.start_time,options.stop_time) # Read YAML file
    lst = lst.filter_keywords(topkeys)                                # Restrict keywords
    lst = lst.filter_by_regex(regex_map)                              # Restrict by regex

    if say.DRYRUN:
        say.done()

    if options.authors_by_country_summary:
        lst.set_html_mode (options.format)
        lst.set_title     (options.attributes)
        lst.authors_by_country_summary()
    elif options.authors_by_country_detail:
        lst.set_title(options.title)
        lst.authors_by_country_detail()
    elif options.papers_summary:
        lst.set_html_mode (options.format)
        lst.set_title     (options.attributes)
        lst.papers_summary(topkeys)
    elif options.papers_by_category_detail:
        lst.set_title(options.title)
        lst.papers_by_category_detail(topkeys, options.disjoint_cat)
    elif options.papers_by_feature_detail:
        lst.set_title(options.title)
        lst.papers_by_feature_detail(topkeys)
    elif options.print_references:
        if options.format[0:3] == 'raw':
            print(lst.__repr__())
        elif options.format[0:4] in ['html','xinc']:
            lst.set_html_mode(options.format[0:4])
            if options.format[4:8] == '-par':
                lst.set_html_paragraph(True)
            else:   # options.format[4:8] == '-list'
                lst.set_html_list(True)
            html_head = ''
            html_tail = ''
            # 'html-par=root,head,tail' syntax: optional root dir plus
            # head/tail boilerplate files wrapped around the output.
            if options.format[0:4] == 'html' and '=' in options.format:
                html,files = options.format.split('=')
                files = files.split(',')
                if len(files) > 0:
                    lst.set_html_root(files[0])
                    say.say( "html root directory is '%s'"%files[0] )
                if len(files) > 1:
                    # [0:-1] drops the trailing newline; 'with' closes the handle.
                    with open(files[1]) as fp:
                        html_head = fp.read()[0:-1]
                    say.say( "html head file is '%s'"%files[1] )
                if len(files) > 2:
                    with open(files[2]) as fp:
                        html_tail = fp.read()[0:-1]
                    say.say( "html tail file is '%s'"%files[2] )
            print(lst.custom_print(html_head,html_tail))
        elif options.format == 'bibtex':
            print(lst.print_bibtex(topkeys))
        elif options.format == 'test':
            for x in lst.atoms:
                say.yell( "----------------> %s"%x.hash['MARKER'] )
                print(x.__repr__())
        else:
            print(lst)
    elif options.per_year:
        # One keyword-count table per calendar year in the list's time span.
        topkeys.count_atoms(lst)
        for year in range(lst.start_time.get(attr='year'),lst.stop_time.get(attr='year')+1):
            sub_topkeys = topkeys.copy(year)
            sub_topkeys.category_only = True
            sub_topkeys.title = 'in %s'%year
            print(sub_topkeys)
    else:
        topkeys.count_atoms(lst)
        topkeys.category_only = False
        print(topkeys)

    sys.exit(0)