import sys, os, re, time
import yaml
from copy import deepcopy
from math import ceil
import tiny_bits
from eon_date import eon_date
from general_lists import list_of_top_level_domains, table_by_month

# AUTHOR and TITLE are always present
#
# TYPE is optional, but better to include it. It provides an explicit
# mechanism for classifying papers. It is not necessarily used to format
# the publication info into a specific reference format.
#
# Possible values:
#
#   in_journal        = journal article
#   in_proceedings    = paper in proceedings for conf, workshop, symp, etc.
#   in_book           = contribution to book (usually a chapter)
#   in_news           = piece in newsletter-type publication (e.g. EOS)
#   in_thesis         = used for co-authors in chapters in thesis
#   PhD/MSc/BSc thesis = thesis
#   Tech./Sci./Int. report = "technical", "scientific", "internal" report
#                       (typically used for internal documents)
#   abstract          = used when only an abstract is available (e.g. in EOS)
#   book              = for whole book (either as author of a whole book,
#                       or as editor of collection of papers)
#   proceedings       = for whole proceedings (as editor)
#   online            = used for material published on-line only
#   presentation      = oral presentation (usually links to slides)
#   unpublished       = unpublished material (last resort if none of the
#                       above fit)
#   book review       = book review
#   editorial         = editorial (used for solar physics editorials)
#
# AUTHOR : comma separated list of last name and initials
#          Name1, A., Name2, B., Name3, C.
#          Always has an even number of comma-separated elements
# TITLE  : Title of paper

# Master table of all keys permitted in a reference record.
# 'mandatory' drives Reference.validate(); 'order' fixes the print order
# used by Reference.__repr__(). Gaps in 'order' (22-29) leave room for
# future keys ahead of the REMARK* group.
g_REF_KEYS = {
    'MARKER'  : { 'mandatory' : True , 'order' :  0 } ,
    'TYPE'    : { 'mandatory' : True , 'order' :  1 } ,
    'AUTHOR'  : { 'mandatory' : False, 'order' :  2 } ,   # True For bibtex?
    'GEOLOC'  : { 'mandatory' : False, 'order' :  3 } ,
    'AFFIL'   : { 'mandatory' : False, 'order' :  4 } ,
    'TITLE'   : { 'mandatory' : True , 'order' :  5 } ,   # For bibtex
    'EDITOR'  : { 'mandatory' : False, 'order' :  6 } ,
    'CTITLE'  : { 'mandatory' : False, 'order' :  7 } ,
    'SERIAL'  : { 'mandatory' : False, 'order' :  8 } ,
    'CHAPTER' : { 'mandatory' : False, 'order' :  9 } ,
    'VOLUME'  : { 'mandatory' : False, 'order' : 10 } ,
    'PAGE'    : { 'mandatory' : False, 'order' : 11 } ,
    'ARTICLE' : { 'mandatory' : False, 'order' : 12 } ,
    'YEAR'    : { 'mandatory' : True , 'order' : 13 } ,
    'PUBLISH' : { 'mandatory' : False, 'order' : 14 } ,
    'PLACE'   : { 'mandatory' : False, 'order' : 15 } ,
    'DOI'     : { 'mandatory' : False, 'order' : 16 } ,
    'URL'     : { 'mandatory' : False, 'order' : 17 } ,
    'TOPKEY'  : { 'mandatory' : False, 'order' : 18 } ,
    'FUNDING' : { 'mandatory' : False, 'order' : 19 } ,
    'FILE'    : { 'mandatory' : False, 'order' : 20 } ,
    'ABS'     : { 'mandatory' : False, 'order' : 21 } ,
    'REMARK'  : { 'mandatory' : False, 'order' : 30 } ,
    'REMARK0' : { 'mandatory' : False, 'order' : 31 } ,
    'REMARK1' : { 'mandatory' : False, 'order' : 32 } ,
    'REMARK2' : { 'mandatory' : False, 'order' : 33 } ,
}

# Gives name of calling function
# used to set up message when raising exception
#def whoami(): Moved to tiny_bits
#    return sys._getframe(1).f_code.co_name


# NOTE(review): this shadows the builtin ReferenceError. All raisers in
# this module use it, so renaming would change the module's public
# interface; flagged for a future coordinated rename.
class ReferenceError(Exception):
    # Simple message-carrying exception used throughout this module.
    def __init__(self, message):
        self.message = message

    def __str__(self):
        return self.message


class Author( object ):
    # Wraps a single author entry (a dict produced by
    # Reference.split_names) together with the timestamp of the paper
    # it came from.
    def __init__(self, timestamp, author):
        self.feature = { 'details': author, }
        self.timestamp = timestamp
        return


class AuthorList( object ):
    # Flattens a ReferenceList into one Author object per (paper, author)
    # pair; used by the by-country / by-author summary code.
    def __init__(self, ref_list):
        self.atoms = [ Author( x.timestamp, author ) for x in ref_list.atoms for author in x.feature['author'] ]
        return


class Reference( object ):
    # One bibliographic record, backed by the raw key/value hash read
    # from the YAML file (keys restricted to g_REF_KEYS).
    # TODO: Highlight single author with   (NOTE(review): rest of this
    #       TODO appears lost to markup stripping in the source)
    # TODO: Create btx entry
    def __init__(self, hash):
        self.hash = hash
        # Should be turned into author objects (author,editor)
        # and keyword stack object (topkey)???
        self.feature = {
            'author': self.split_names () ,   # Can be empty, but never is (would fail validation)
            'editor': self.split_names ('EDITOR'),   # Can be empty
            'topkey': self.split_topkey() ,   # Can be empty
            'remark': self.all_remarks () ,   # Can be empty
        }
        self.validate()
        # If the month is not specified self.time is set to Jan 1 of the year
        # and self.has_month is set to False.
        # YEAR is 'YYYY-MM' (7 chars) when a real month is present;
        # 'YYYY' or 'YYYY-00' means month unknown.
        self.has_month = len(hash['YEAR']) == 7 and '-00' not in hash['YEAR']
        year = hash['YEAR'] if self.has_month else hash['YEAR'][0:4]+'-01'
        self.timestamp = eon_date(year,format='YYYY-MM')
        # BibTeX key; filled in later by ReferenceList.generate_bibtex().
        self.bibtex = None
        return

    def __repr__(self):
        # Plain key/value dump in g_REF_KEYS 'order'; appends the BibTeX
        # key when one has been generated.
        # NOTE: 'str' shadows the builtin here (kept byte-identical).
        str = ''
        for key in sorted( g_REF_KEYS.keys(), key=lambda k: g_REF_KEYS[k]['order'] ):
            if self.hash.has_key(key):   # Python 2 idiom, kept as-is
                str += "%10s: %s\n"%(key,self.hash[key])
        if self.bibtex != None:
            str += "%10s: %s\n"%('BIBTEX',self.bibtex)
        return str
    #

# 2002-10 # C. Estan and G. Varghese
# New directions in traffic measurement and accounting
# in: SIGCOMM Conf.
# ACM SIGCOMM Computer Communication Review 32 (4), 323-336, October 2002
# Keys: passive-oc48
# Remark: No acknowledgement or reference to dataset
#

def __str__(self): return self.custom_print() def custom_print(self, print_attribute=dict()): html_par = print_attribute['html_paragraph'] if print_attribute.has_key('html_paragraph' ) else False html_list = print_attribute['html_list' ] if print_attribute.has_key('html_list' ) else False html_root = print_attribute['html_root' ] if print_attribute.has_key('html_root' ) else '' year, month = self.timestamp.get(attr=['year','month']) hash = self.hash str = '' if html_par: soft_break = '\n' hard_break = '
'+soft_break if html_par else soft_break str += '

' + \ soft_break + \ ''%hash['MARKER'] + \ '' + \ '%s-%s'%(year,'%02d'%month[0] if self.has_month else '00') + \ soft_break elif html_list: soft_break = '' hard_break = ', ' str += '

  • ' else: soft_break = '\n' hard_break = soft_break # 1. author names line = self.join_names(key='AUTHOR',html=html_par or html_list) str += ('%s'%line if html_par else line)+hard_break # 2. paper title line = hash['TITLE'] str += ( line if html_par else '"%s"'%line )+hard_break page_and_year_needed = False page_and_year_used = False volume_needed = False if hash.has_key('CTITLE'): # 3. conference proceedings/book str += 'in: ' # 3a. editor(s) of proceedings if hash.has_key('EDITOR'): str += self.join_names(key='EDITOR',html=html_par or html_list)+' (ed'+['','s'][len(self.feature['editor']) > 1]+'.), ' # 3b. title of proceedings or book line = hash['CTITLE'] for cmark in ['Conf. on','Workshop on','Symp. on']: if cmark in line: cpos = line.find(cmark)+len(cmark)+1 line = line[0:cpos]+'"'+line[cpos:]+'"' str += line # 3c chapter of book if hash.has_key('CHAPTER'): line = hash['CHAPTER'] str += ', %s %s'%( line.split('=') if '=' in line else ('Ch.',line) ) # Conf proceedings could still be published in a 'serial' publication, e.g. # Springer Lecture Notes. If there is, then start a new line for the # serial publication. if hash.has_key('SERIAL'): str += hard_break page_and_year_needed = True volume_needed = True if hash.has_key('SERIAL'): str += hash['SERIAL'] page_and_year_needed = True volume_needed = True # The volume comes after the conf title (if no serial publication is specified), # or after the serial publication if volume_needed and hash.has_key('VOLUME'): line = hash['VOLUME'] str += ' '+line+'' if html_par else ' Vol. '+line # The page gets printed here if the conf title and/or serial publication # has been printed already. If these are not present, the page numbers get # printed with the publ. type after the publisher. if page_and_year_needed: if hash.has_key('PAGE') or hash.has_key('ARTICLE'): line = hash['PAGE'] if hash.has_key('PAGE') else hash['ARTICLE'] str += ( ' ' if line[0] == '(' else ', ' if hash.has_key('VOLUME' ) else ', art. 
' if hash.has_key('ARTICLE') else ', p. ' )+line str += '%s %s'%(', '+month[1] if self.has_month else ',',year)+hard_break page_and_year_used = True # Start new line with publisher # If VOLUME is present, and it was not used after CTITLE or SERIAL # then use it here after PUBLISH (used for thesis and tech report) if hash.has_key('PUBLISH'): str += hash['PUBLISH'] if not volume_needed and hash.has_key('VOLUME'): str += ' ('+hash['VOLUME']+')' if page_and_year_used: str += hard_break # Append page, year and type to publisher if not used yet if not page_and_year_used: line = hash['TYPE'] str += '%s %s'%(', '+month[1] if self.has_month else ',',year) + \ ( '; %s %s'%(line,hash['PAGE']) if hash.has_key('PAGE') else '; %s %s'%(line,hash['ARTICLE']) if hash.has_key('ARTICLE') else '; %s'%line ) + \ hard_break if html_par: str += 'Keys: %s'%', '.join(self.feature['topkey'])+hard_break if hash.has_key('FUNDING'): str += 'Funding: %s'%hash['FUNDING']+hard_break if html_root != '': if os.path.isfile( os.path.join(html_root,'abstract','%s.html'%hash['MARKER']) ): str += ''%hash['MARKER']+soft_break if os.path.isfile( os.path.join(html_root,'pdf_private','%s.pdf'%hash['MARKER']) ): str += ''%hash['MARKER']+soft_break elif os.path.isfile( os.path.join(html_root,'pdf_public','%s.pdf'%hash['MARKER']) ): str += ''%hash['MARKER']+soft_break if hash.has_key('URL'): str += ''+soft_break if hash.has_key('DOI'): line = hash['DOI'] str += ('' if html_par else 'doi: '+line)+soft_break if hash.has_key('ABS'): if html_par: cols = 80 rows = long(ceil(1.0*len(hash['ABS'])/cols)) str += ''+soft_break+ \ '' +soft_break elif not html_list: str += '--- '+hash['ABS']+soft_break str += '

    ' if html_par else '
  • ' if html_list else '' return str+soft_break def print_bibtex(self, keywords): hash = self.hash type = hash['TYPE'] btx_type = 'ARTICLE' if type == 'in_journal' else \ 'INPROCEEDINGS' if type == 'in_proceedings' else \ 'INPROCEEDINGS' if type == 'in_book' else \ 'PHDTHESIS' if type == 'PhD thesis' else \ 'MASTERSTHESIS' if type == 'MSc thesis' else \ 'TECHREPORT' if type == 'tech. report' else \ 'TECHREPORT' if type == 'BSc thesis' else \ 'TECHREPORT' if type == 'on line' else \ 'TECHREPORT' if type == 'class report' else \ 'MISC' #if ctitle[0] == '=': # prefix, ctitle = ctitle[1:].split('=') # Keywords TITLE and YEAR are always present # (tested in method Reference.validate()) # Keyword AUTHOR is absent only for type='proceedings', i.e. if a # conf. proceedings is listed as a whole with only the EDITOR field # present. str = '@%s{\n%s,\n Author = {%s},\n Title = {{%s}},\n'%( btx_type, self.bibtex, self.join_names(key='AUTHOR' if hash.has_key('AUTHOR') else 'EDITOR',reverse=True,separator=[' and ',' and ']), hash['TITLE'] ) if btx_type == 'ARTICLE': # Required fields: author, title, journal, year # Optional fields: volume, number, pages, month, note, key #@ARTICLE{ #WZGW2013, # Author = {Wang, F. and Zhang, Y. and Guo, H. and Wang, C.}, # Title = {{Combating good point set scanning-based self-learning worms by using predators}}, # Journal = {J. 
Network Security}, # Volume = {15}, # Number = {1}, # Pages = {141-148}, # Note = {}, # Keywords= {Witty Worm}, # Month = {January}, # Year = {2013} #} if hash.has_key('SERIAL'): str += ' Journal = {%s},\n'%hash['SERIAL'] else: raise ReferenceError( "%s,\n%s\nmandatory key SERIAL missing"%(tiny_bits.whoami(),self.hash) ) if hash.has_key('VOLUME'): volume = hash['VOLUME'] if '(' in volume and ')' in volume: str += ' Volume = {%s},\n Number = {%s},\n'%(volume[0:volume.find('(')].strip(),volume[volume.find('(')+1:volume.find(')')].strip()) else: str += ' Volume = {%s},\n'%volume if hash.has_key('PAGE'): str += ' Pages = {%s},\n'%hash['PAGE'] elif btx_type == 'INPROCEEDINGS': # Required fields: author, title, booktitle, year # Optional fields: editor, volume/number, series, pages, address, month, organization, publisher, note, key #@INPROCEEDINGS{ #HDSSP2013, # Author = {Hofstede, R. and Drago, I. and Sperotto, A. and Sadre, R. and Pras, A.}, # Title = {{Measurement artifacts in netFlow data}}, # BookTitle = {Conf. Passive and Active Measurement (PAM)}, # Keywords= {Passive}, # Month = {March}, # Year = {2013} #} if hash.has_key('CTITLE'): str += ' BookTitle = {%s},\n'%(hash['CTITLE']+(', Ch. 
'+hash['CHAPTER'] if hash.has_key('CHAPTER') else '')) else: raise ReferenceError( "%s,\n%s\nmandatory CTITLE missing"%(tiny_bits.whoami(),self.hash) ) if hash.has_key('EDITOR'): str += ' Editor = {%s},\n'%self.join_names(key='EDITOR',reverse=True,separator=[' and ',' and ']) if hash.has_key('SERIAL'): str += ' Series = {%s},\n'%hash['SERIAL'] if hash.has_key('VOLUME'): str += ' Volume = {%s},\n'%hash['VOLUME'] #volume = hash['VOLUME'] #if '(' in volume and ')' in volume: # str += ' Volume = {%s},\n Number = {%s},\n'%(volume[0:volume.find('(')].strip(),volume[volume.find('(')+1:volume.find(')')].strip()) #else: # str += ' Volume = {%s},\n'%volume if hash.has_key('PAGE'): str += ' Pages = {%s},\n'%hash['PAGE'] if hash.has_key('PUBLISH'): str += ' Publisher = {%s},\n'%hash['PUBLISH'] #if hash.has_key('PLACE'): # str += ' Address = {' +hash['PLACE'] +'},\n' elif btx_type == 'INBOOK': # Required fields: author/editor, title, chapter/pages, publisher, year # Optional fields: volume/number, series, type, address, edition, month, note, key if hash.has_key('CTITLE'): str += ' BookTitle = {%s},\n'%(hash['CTITLE']+(', Ch. 
'+hash['CHAPTER'] if hash.has_key('CHAPTER') else '')) else: raise ReferenceError( "%s,\n%s\nmandatory CTITLE missing"%(tiny_bits.whoami(),self.hash) ) if hash.has_key('EDITOR'): str += ' Editor = {%s},\n'%self.join_names(key='EDITOR',reverse=True,separator=[' and ',' and ']) if hash.has_key('SERIAL'): str += ' Series = {%s},\n'%hash['SERIAL'] if hash.has_key('VOLUME'): volume = hash['VOLUME'] if '(' in volume and ')' in volume: str += ' Volume = {%s},\n Number = {%s},\n'%(volume[0:volume.find('(')].strip(),volume[volume.find('(')+1:volume.find(')')].strip()) else: str += ' Volume = {%s},\n'%volume if hash.has_key('PAGE'): str += ' Pages = {%s},\n'%hash['PAGE'] if hash.has_key('PUBLISH'): str += ' Publisher = {%s},\n'%hash['PUBLISH'] elif btx_type in ['PHDTHESIS','MASTERSTHESIS']: # Required fields: author, title, school, year # Optional fields: type, address, month, note, key #@PHDTHESIS{ #S2011c, # Author = {Schear, N.}, # Title = {{Preventing encrypted traffic analysis}}, # School = {Univ. 
Illinois at Urbana-Champagne}, # Note = {}, # Keywords= {Passive}, # Month = {January}, # Year = {2011} #} if hash.has_key('PUBLISH'): str += ' School = {%s},\n'%hash['PUBLISH'] else: raise ReferenceError( "%s,\n%s\nmandatory PUBLISH missing"%(tiny_bits.whoami(),self.hash) ) elif btx_type == 'TECHREPORT': # Required fields: author, title, institution, year # Optional fields: type, number, address, month, note, key #@TECHREPORT{ #S2012a, # Author = {Sherry, J.}, # Title = {{Future architectures for middlebox processing services on the Internet and in the Cloud}}, # Institution = {UC Berkeley}, # Note = {}, # Keywords= {Topology AS Relationships}, # Month = {December}, # Year = {2012} #} if hash.has_key('PUBLISH'): str += ' Institution = {%s},\n'%hash['PUBLISH'] else: raise ReferenceError( "%s,\n%s\nmandatory PUBLISH missing"%(tiny_bits.whoami(),self.hash) ) elif btx_type == 'MISC': if hash.has_key('PUBLISH'): str += ' Institution = {%s},\n'%hash['PUBLISH'] if hash .has_key('DOI'): str += ' Note = {},\n'%hash['DOI'] elif hash.has_key('URL'): str += ' Note = {<%s>},\n'%hash['URL'] str += ' Keywords= {%s},\n Month = {%s},\n Year = {%s}\n}'%( ' '.join( keywords.used(self).keys()), self.timestamp.get(format='Month'), self.timestamp.get(format='YYYY' ) ) return str def validate(self): # Make sure all keys are on the g_REF_KEYS list bad = [ key for key in self.hash if not g_REF_KEYS.has_key(key) ] if bad: print bad raise ReferenceError( "%s, invalid key(s) '%s'\n%s"%(tiny_bits.whoami(),','.join(bad),self.hash) ) # Check for mandatory keys mandatory = [ key for key in g_REF_KEYS if g_REF_KEYS[key]['mandatory'] ] missing = ','.join( [ key for key in mandatory if not self.hash.has_key(key) ] ) if len(missing) > 0: raise ReferenceError( "%s,\n%s\nmissing types '%s'"%(tiny_bits.whoami(),self.hash,missing) ) return #+ # NAME: # Reference.split_names # PURPOSE: # Converts AUTHOR or EDITOR fields in reference hash # to a list of hashes with first and last name of authors # CALLING 
SEQUENCE: # result = ref.split_names( key ) # INPUTS: # ref Reference object # OPTIONAL INPUTS: # key string; 'AUTHOR' or 'EDITOR' # if omitted then 'AUTHOR' is assumed. # OUTPUTS: # results list of hashes with two keys: 'last' and 'first' # PROCEDURE: # 'AUTHOR and 'EDITOR' field are stored as a single string # in ref.hash['AUTHOR'] and ref.hash['EDITOR'] in the form: # last-1, first-1; last-2, first-2; ... or # last-1, first-1, last-2, first-2, ... # The second form is supported for backward compatibility # (will be phased out at some point). #- def split_geoloc(self): geoloc = [] if self.hash.has_key('GEOLOC'): line = self.hash['GEOLOC'].split(';') for entry in line: if 'unknown' in entry: loc = { 'country': 'unknown' } else: pieces = entry.split(',') loc = { 'city': pieces[0].strip(), 'country': pieces[-1].strip() } if len(pieces) == 3: loc['state'] = pieces[1].strip() geoloc.append( loc ) return geoloc def split_names(self, key='AUTHOR'): author = [] if self.hash.has_key(key): line = [] for x in self.hash[key].split(';'): line.extend( x.split(',') ) if len(line)%2 != 0: raise ReferenceError( '%s, error in %s record\n%s'%(tiny_bits.whoami(),key,self.hash[key]) ) author = [ { 'last': line[i].strip(), 'first': line[i+1].strip() } for i in range(0,len(line),2) ] if key == 'AUTHOR': geoloc = self.split_geoloc() if len(geoloc) == len(author): for n in range(len(author)): for k in geoloc[n]: author[n][k] = geoloc[n][k] elif len(geoloc) == 1: for n in range(len(author)): for k in geoloc[0]: author[n][k] = geoloc[0][k] elif len(geoloc) > 0: raise ReferenceError( '%s,\n%s\n# geolocs does not match # authors'%(tiny_bits.whoami(),self.hash) ) return author def split_topkey(self): topkey = [] if self.hash.has_key('TOPKEY'): topkey = list(set( [ x.strip() for x in self.hash['TOPKEY'].split(',') ] )) return topkey def join_names(self, key='AUTHOR', reverse=False, html=False, separator=[', ', ' and ']): #authors = self.feature['editor'] if key == 'EDITOR' else 
self.feature['author'] authors = self.split_names(key) line = '' for author in authors: if line != '': line += separator[ author == authors[-1] ] first = author['first'] last = author['last' ] if reverse: if html: if '<<' in first: first = first.replace('<<','' ) if '>>' in first: first = first.replace('>>','') if '<<' in last: last = last.replace('<<','' ) if '>>' in last: last = last.replace('>>','') line += last+', '+first else: if '<<' in last: if '>>' in last: last = last.replace('<<','' ) last = last.replace('>>','') else: last = last.replace('<<','') last += '' if '>>' in first: if '<<' in first: first = first.replace('<<','' ) first = first.replace('>>','') else: first = first.replace('>>','') first = ''+first line += first+' '+last return line def all_remarks(self): return [ self.hash[x] for x in self.hash if 'REMARK' in x ] def matches_regex( self, regex_map ): say = tiny_bits.say('Reference.%s'%tiny_bits.whoami()) for key in regex_map: KEY = key.upper() if KEY != 'KEY' and KEY != 'CAT': if not self.hash.has_key(KEY): return False #raise ReferenceError( "%s, ref has no key '%s'\n%s'"%(tiny_bits.whoami(),KEY,self.__repr__()) ) val = self.hash[KEY] m = re.search(regex_map[key],val) if not m: say.message( "'%s' does not match '%s'\n%s"%(regex_map[key],val,self.__repr__()), 3 ) return False self.hash[KEY] = val[0:m.start()]+'<<'+val[m.start():m.end()]+'>>'+val[m.end():] return True class ReferenceList(): def __init__(self, input, start_time=None, stop_time=None ): say = tiny_bits.say('ReferenceList.%s'%tiny_bits.whoami()) if type( input ) == type( "str" ): # Read yaml file # Read list of references; creates list of reference objects. 
# Need way to select subsets say.say('read '+input) handle = open( input ) self.atoms = [ Reference( x ) for x in yaml.load_all(handle) ] handle.close() self.yaml_file = input self.print_attribute = dict() self.has_bibtex = False elif isinstance( input, (ReferenceList) ): self.atoms = deepcopy( input.atoms ) self.yaml_file = input.yaml_file self.print_attribute = input.print_attribute self.has_bibtex = input.has_bibtex else: # Neither yaml file, nor dict raise ReferenceError( "%s, cannot init from %s"%(tiny_bits.whoami(), type( input)) ) say.say('number of references is %s'%len(self.atoms)) self.generate_bibtex() self.filter_times(start_time, stop_time) return def filter_times(self, start_time=None, stop_time=None): nrefs = len(self.atoms) # Only retain entries between start and stop time if start_time != None and not isinstance(start_time, (eon_date)): start_time = eon_date( start_time ) if stop_time != None and not isinstance(stop_time, (eon_date)): stop_time = eon_date( stop_time ) self.atoms = [ x for x in self.atoms if x.timestamp.between(start_time,stop_time) ] if start_time != None or stop_time != None: say = tiny_bits.say('ReferenceList.%s'%tiny_bits.whoami()) say.say( '%s/%s references since %s' %(len(self.atoms),nrefs,start_time.get(format='YYYY-MM')) if stop_time == None else \ '%s/%s references before %s' %(len(self.atoms),nrefs, stop_time.get(format='YYYY-MM')) if start_time == None else \ '%s/%s references between %s and %s'%(len(self.atoms),nrefs,start_time.get(format='YYYY-MM'),stop_time.get(format='YYYY-MM')) ) # Set instance variables self.start_time = min( [ x.timestamp for x in self.atoms ] ).bop('month') if start_time == None else start_time self.stop_time = max( [ x.timestamp for x in self.atoms ] ).eop('month') if stop_time == None else stop_time return def __repr__(self): # Calls Reference.__repr__ for each Reference object return '\n'.join( [ '%s'%x.__repr__() for x in self.atoms ] ) def __str__(self): # Calls Reference.__str__ for each 
Reference object return '\n'.join( [ '%s'%x.__str__() for x in self.atoms ] ) def len(self): return len(self.atoms) def set_html_paragraph(self, value=False ): self.print_attribute['html_paragraph'] = value; return def set_html_list(self, value=False ): self.print_attribute['html_list'] = value; return def set_html_root(self, value='' ): self.print_attribute['html_root'] = value; return def set_html_mode(self, value='' ): # '', 'xinc', 'html' self.print_attribute['html_mode'] = value; return def set_title(self, value='' ): if value != '': self.print_attribute['title'] = value; return def custom_print(self, html_head='', html_tail=''): plain_html = self.print_attribute.has_key('html_mode') and self.print_attribute['html_mode'] == 'html' str = '' if plain_html: str += html_head if html_head != '' else \ '\n' + \ '\n' + \ '\n' + \ '\n' + \ '\n' + \ '\n' + \ '
    \n' if len(str) > 0 and 'function TextareaOnOff' not in str: pos = str.find('') if pos == -1: raise ReferenceError( "%s, not found in '%s'"%(tiny_bits.whoami(), str) ) str = str[0:pos]+ \ '\n' + \ str[pos:] str += '\n'.join( [ '%s'%x.custom_print( self.print_attribute ) for x in self.atoms ] )+'\n' if plain_html: str += html_tail if html_tail != '' else "
    \n\n" return str def print_bibtex(self, keywords=None ): str = '\n'.join( [ '%s'%x.print_bibtex( keywords ) for x in self.atoms ] )+'\n' return str def generate_bibtex(self): say = tiny_bits.say('ReferenceList.%s'%tiny_bits.whoami()) if self.has_bibtex: # Bibtex keys have been generated already return # Add bibtex entry to all publications # This must be a method for ReferenceList (not Reference) because # bibtex keys must be unique across the whole reference list nrefs = len(self.atoms) bibtex = [] for iref in range(nrefs): reference = self.atoms[iref] # The default key is generated from the first letter in the last name # of all authors, plus the year. If that doesn't make the key unique # start appending letters a,b,c,... key = '' # Append 1st letter of last name for author in reference.feature['author'] if reference.feature['author'] else reference.feature['editor']: key += 'S' if 'vestka' in author['last'] else \ author['last'][4] if author['last'][0:3] == 'van ' else \ author['last'][0] key += '%s'%reference.timestamp.get(attr='year') # Append year n = bibtex.count(key) if n > 1: raise ReferenceError( "%s, duplicate bibtex key '%s' constructed"%(tiny_bits.whoami(),key) ) # If shortest key is already in use, then a single letter is appended. if n == 1: # The shortest key is already in use. Change the existing key # by appending the letter 'a'. 
if bibtex.count(key+'a') > 0: raise ReferenceError( "%s, bibtex key '%s' already exists"%(tiny_bits.whoami(),key+'a') ) n = bibtex.index(key) say.message( "duplicate key '%s' -> '%sa'"%(key,key), 3 ) bibtex[n] += 'a' self.atoms[n].bibtex = bibtex[n] if bibtex.count(key+'a') > 0: alfabet = 'abcdefghijklmnopqrstuvwxyz' for letter in alfabet: if bibtex.count(key+letter) == 0: key += letter break else: raise ReferenceError( "%s, unable to construct bibtex key from '%s[%s]'"%(tiny_bits.whoami(),key,alfabet) ) bibtex.append(key) reference.bibtex = key self.has_bibtex = True return #+ # NAME: # ReferenceList.filter_keywords # PURPOSE: # Trim list of references to those matching the specified keywords # CALLING SEQUENCE: # result = ref_lst.filter_keywords(fstack) # INPUTS: # ref_lst ReferenceList object # fstack KeywordStack object # OUTPUTS: # result ReferenceList object containg reference in 'ref_lst' # that match keywords in 'fstack' #- def filter_keywords(self, keywords=None): sublist = deepcopy( self ) if keywords != None: sublist.atoms = [] for ref in self.atoms: used = keywords.used(ref) # Prints warning for --debug 2 if there is no match if used.keys(): ref_copy = deepcopy( ref ) keys = used.category_by_feature() ref_copy.hash['TOPKEY'] = ','.join( keys ) ref_copy.feature['topkey'] = keys sublist.atoms.append( ref_copy ) say = tiny_bits.say('ReferenceList.%s'%tiny_bits.whoami()) say.say('%s/%s references'%(len(sublist.atoms),len(self.atoms))) return sublist def filter_by_regex(self, regex_map): sublist = deepcopy( self ) if regex_map: sublist.atoms = [ x for x in self.atoms if x.matches_regex(regex_map) ] say = tiny_bits.say('ReferenceList.%s'%tiny_bits.whoami()) say.say('%s/%s references'%(len(sublist.atoms),len(self.atoms))) return sublist def authors_by_country_summary(self): world = WorldStack( self ) author_list = AuthorList( self ) world.count_atoms(author_list) cat = world.keys()[0] countries = world.keys(cat) count_by_country = dict( zip( countries, [ 
world.total(cat,x) for x in countries ] ) ) authors = AuthorStack( self ) authors.count_atoms(author_list) cat = authors.keys()[0] names = authors.keys(cat) count_by_author = dict( zip( names, [ authors.total(cat,x) for x in names ] ) ) # Sort into order of decreasing number of author per country, and papers per author countries = sorted( count_by_country, key=lambda abbrev: count_by_country[abbrev], reverse=True ) authors = sorted( count_by_author , key=lambda abbrev: count_by_author [abbrev], reverse=True ) #for key in countries: # print '%3s:%4d'%(key,count_by_country[key]) #for key in authors: # print '%-30s:%3s'%(key,count_by_author[key]) nr_of_pubs = len(self.atoms) nr_of_countries = len(countries ) nr_of_authors = len(authors ) html_mode = self.print_attribute['html_mode'] if self.print_attribute.has_key('html_mode') else '' title = ' ('+self.print_attribute['title']+')' if self.print_attribute.has_key('title') else '' title_main = 'External data papers: authors per country and papers per author'+title title_for_country_table = "Number of authors per country"+title title_for_author_table = 'Number of papers by most prolific authors'+title current_time = time.strftime('%Y-%b-%d %H:%M:%S',time.gmtime())+' UTC' if html_mode == 'xinc': content = [ '' , 'Last updated on '+current_time+'' , '
    ' , 'There are %s papers with %s different authors in %s countries
    '%(nr_of_pubs,nr_of_authors,nr_of_countries), 'The average number of authors per paper is %.1f
    '%(sum(count_by_country.values())/float(nr_of_pubs)), '
    ' , '' , ] elif html_mode == 'html': content = [ '' , '' , ''+title_main+'' , '' , '' , '' , 'Last updated on '+current_time+'' , '
    ' , '

    '+title_for_country_table+'

    ' , '
    ' , 'As determined from author affiliations specified in papers.
    ', 'The count includes authors and co-authors
    ' , 'There are %s papers with %s different authors in %s countries
    '%(nr_of_pubs,nr_of_authors,nr_of_countries), 'The average number of authors per paper is %.1f
    '%(sum(count_by_country.values())/float(nr_of_pubs)), '
    ' , '
    ' , ] elif html_mode == 'simple': content = [ 'Last updated on '+current_time , '' , title_for_country_table , '' , 'As determined from author affiliations specified in papers' , 'The count includes authors and co-authors' , 'There are %s papers with %s different authors in %s countries'%(nr_of_pubs,nr_of_authors,nr_of_countries), 'The average number of authors per paper is %.1f'%(sum(count_by_country.values())/float(nr_of_pubs)), '' , ] else: say = tiny_bits.say('ReferenceList.%s'%tiny_bits.whoami()) say.die("unrecognized print mode '%s'"%html_mode) nr_of_columns = 4 nr_of_rows, leftover = divmod(nr_of_countries, nr_of_columns) nr_of_rows += leftover > 0 country_name = list_of_top_level_domains() format = '%20s %5s |' if html_mode == 'simple' else '' #'' for irow in range(nr_of_rows): add_content = '' if html_mode == 'simple' else '' for icolumn in range(nr_of_columns): i = icolumn*nr_of_rows+irow if i >= nr_of_countries: continue add_content += format%(country_name[countries[i]],count_by_country[countries[i]]) add_content += '' if html_mode == 'simple' else '' content.append(add_content) if html_mode == 'xinc': content.extend( [ '
    %s%s %s%s
    ' , '
    ' , ] ) else: if html_mode == 'html': content.extend( [ '' , '
    ' , '

    '+title_for_author_table+'

    ' , '
    ' , '' , ] ) elif html_mode == 'simple': content.extend( [ '' , title_for_author_table , '' , ] ) nr_of_columns = 4 nr_of_authors = 16 nr_of_rows, leftover = divmod(nr_of_authors, nr_of_columns) nr_of_rows += leftover > 0 for irow in range(nr_of_rows): add_content = '' if html_mode == 'simple' else '' for icolumn in range(nr_of_columns): i = icolumn*nr_of_rows+irow if i >= nr_of_authors: continue add_content += format%(authors[i],count_by_author[authors[i]]) add_content += '' if html_mode == 'simple' else '' content.append(add_content) if html_mode == 'html': content.extend( [ '
    ' , '
    ' , '' , '' , ] ) print '\n'.join(content) return def papers_summary(self, keywords): keywords.count_atoms( self ) top_count = keywords.stack['count'] count = { 'all_datasets': top_count.calendar } key_replacement = { 'all_datasets' : '' } html_mode = self.print_attribute['html_mode'] if self.print_attribute.has_key('html_mode') else 'simple' title = ' ('+self.print_attribute['title']+')' if self.print_attribute.has_key('title') else '' title = 'External data papers: papers per year and month'+title current_time = time.strftime('%Y-%b-%d %H:%M:%S',time.gmtime())+' UTC' if html_mode == 'xinc': content = [ '' , 'Last updated on '+current_time+'' , '
    ' , 'Total number of papers is %s'%keywords.total() , ] elif html_mode == 'html': content = [ '' , '' , ''+title+'' , '' , '' , '' , 'Last updated on '+current_time+'' , '
    ' , 'Total number of papers is %s'%keywords.total() , '' , ] elif html_mode == 'simple': content = [ 'Last updated on '+current_time , '' , 'Total number of papers is %s'%keywords.total() , '' , ] else: say = tiny_bits.say('ReferenceList.%s'%tiny_bits.whoami()) say.die("unrecognized print mode '%s'"%html_mode) content.extend( table_by_month( count, key_replacement=key_replacement, in_html=html_mode != 'simple' ) ) if html_mode == 'xinc': content.extend( [ '' , ] ) elif html_mode == 'html': content.extend( [ '
    ' , '' , '' , ] ) print '\n'.join(content) return def authors_by_country_detail(self): world = WorldStack( self ) world.count_atoms( AuthorList( self ) ) cat = world.keys()[0] # Only one: "world" abbrev = world.keys(cat) # Country abbreviations: 'US' country = world.country() # Full country names: 'US' -> 'United States' count_by_country = dict( zip( abbrev, [ world.stack[cat][x]['count'].calendar for x in abbrev ] ) ) title = self.print_attribute['title'] if self.print_attribute.has_key('title') else 'country of origin of authors' print '\n'.join( table_by_month( count_by_country, key_replacement=country, sort_by_count=True, header=[title,'nr of authors'] ) )+'\n' return def papers_by_category_detail(self, keywords, disjoint_cat=False): keywords.count_atoms( self, disjoint_cat ) cat = keywords.keys() count_by_cat = dict( zip( cat, [ keywords.stack[x]['count'].calendar for x in cat ] ) ) title = self.print_attribute['title'] if self.print_attribute.has_key('title') else 'published papers' print '\n'.join( table_by_month( count_by_cat, sort_by_count=True, header=[title,'nr of papers'] ) )+'\n' def papers_by_feature_detail(self, keywords): keywords.count_atoms( self ) cat = keywords.category_by_feature() count_by_feature = dict( zip( cat, [ keywords.stack[cat[x]][x]['count'].calendar for x in cat ] ) ) title = self.print_attribute['title'] if self.print_attribute.has_key('title') else 'published papers' print '\n'.join( table_by_month( count_by_feature, sort_by_count=True, header=[title,'nr of papers'] ) )+'\n' # make author_stack class CountCalendar( object ): def __init__(self, calendar=dict()): self.calendar = deepcopy(calendar) return def __iadd__(self, count): for year in count.calendar: if not self.calendar.has_key(year): self.calendar[year] = dict() for month in count.calendar[year]: if not self.calendar[year].has_key(month): self.calendar[year][month] = 0 self.calendar[year][month] += count.calendar[year][month] return self def __str__(self): s = '' for 
year in self.calendar: s += '%s |'%year m = [0]*12 for month in self.calendar[year]: m[month-1] = self.calendar[year][month] s += (' %5d'*12+'\n')%tuple(m) if s: s = s[0:-1] return s def copy(self,year=None,month=None): if year == None and month == None: count = self.calendar elif month == None: # year != None count = { year: self.calendar[year] } if year in self.calendar else dict() elif year == None: # month != None count = dict() for year in self.calendar: if self.calendar[year].has_key(month): if not count.has_key(year): count[year] = dict() count[year][month] = self.calendar[year][month] else: # year != None and month != None count[year][month] = self.calendar[year][month] if self.calendar.has_key(year) and self.calendar[year].has_key(month) else dict() return CountCalendar( count ) def total(self, year=None, month=None): return sum( [ self.calendar[year][month] for year in self.calendar for month in self.calendar[year] ] ) class FeatureStack( object ): #+ # NAME: # FeatureStack.__init__ # PURPOSE: # Initialize FeatureStack object # CALLING SEQUENCE: # result = FeatureStack( stack_yaml_file ) # INPUTS: # stack_yaml_file fully-qualified name of yaml file containing # the dictionary of key definitions. # Alternatively, the dict itself can be specified # as input. # OPTIONAL INPUT: # ignore list (?? unused) # OUTPUTS: # result FeatureStack object # PROCEDURE: # result.stack = { # count = # category-1 = { # count = # keyword-a = { # count = # attr-1 = # attr-2 = # }, # keyword-b = { # ... # }, # }, # category-1 = { # ... 
# }, # } #- def __init__(self, yaml_file, count=CountCalendar() ): say = tiny_bits.say('FeatureStack.%s'%tiny_bits.whoami()) if type( yaml_file ) == type( "str" ): # Read yaml file say.say('read '+yaml_file) handle = open( yaml_file ) features = yaml.load(handle) handle.close() self.yaml_file = yaml_file elif type( yaml_file ) == type( dict() ): # Continue with dict features = yaml_file else: # Neither yaml file, nor dict raise ReferenceError( "%s, cannot init from %s"%(tiny_bits.whoami(), type( yaml_file)) ) if 'count' in features: raise ReferenceError( "%s, 'count' is reserved hash key"%tiny_bits.whoami() ) self.stack = { 'count' : count.copy() } for cat in features: cat_long, cat_short = cat.split(' -> ') if ' -> ' in cat else (cat,'') if 'count' in features[cat]: raise ReferenceError( "%s, 'count' is reserved hash key"%tiny_bits.whoami() ) self.stack[cat_long] = { 'count' : count.copy() } for key in features[cat]: if features[cat][key] == None: features[cat][key] = dict() if 'count' in features[cat][key]: raise ReferenceError( "%s, 'count' is reserved hash key"%tiny_bits.whoami() ) self.stack[cat_long][key] = { 'count' : count.copy() } for item in features[cat][key]: self.stack[cat_long][key][item] = features[cat][key][item] return #+ # NAME: # FeatureStack.__iadd__ # PURPOSE: # Overloads += operator: adds 'count' fields in specified FeatureStack object # CALLING SEQUENCE: # fstack += feature # INPUTS: # fstack FeatureStack object # feature FeatureStack object # OUTPUTS: # fstack stack object with updated 'count' fields #- def __iadd__(self, feature): self.stack['count'] += feature.stack['count'] for cat in feature.keys(): self.stack[cat]['count'] += feature.stack[cat]['count'] for key in feature.keys(cat): self.stack[cat][key]['count'] += feature.stack[cat][key]['count'] return self def __str__(self): category_only = self.category_only if hasattr(self, 'category_only') else False title = self.title if hasattr(self, 'title') else '' keep_zero = self.keep_zero 
if hasattr(self,'keep_zero') else False keep_attr = self.keep_attr if hasattr(self,'keep_attr') else False count = self.total() s = "Total count: %s%s"%(count,'' if title == '' else ' '+title) if count > 0 or keep_zero: for cat in self.keys(): count = self.total(cat) if count > 0 or keep_zero: s += "\n %-32s -->: %5d"%(cat,count) if not category_only: for key in self.keys(cat): count = self.total(cat,key) if count > 0 or keep_zero: s += "\n %-32s: %5d"%(key,count) if keep_attr: for attr in self.keys(cat,key): s += " %-8s: %-5s"%(attr,self.stack[cat][key][attr]) return s def copy(self,year=None,month=None): features = deepcopy(self) features.stack['count'] = self.stack['count'].copy(year,month) for cat in features.keys(): features.stack[cat]['count'] = self.stack[cat]['count'].copy(year,month) for key in features.keys(cat): features.stack[cat][key]['count'] = self.stack[cat][key]['count'].copy(year,month) return features def category_by_feature(self): # Inverts dictionary to provide the mapping topkey --> category category = dict() for cat in self.keys(): for key in self.keys(cat): category[key] = cat return category #+ # NAME: # FeatureStack.filter_attributes # PURPOSE: # Reduce set of keywords to those matching the specified attributes # CALLING SEQUENCE: # fstack.filter_attributes( attributes ) # INPUTS: # fstack FeatureStack object # attributes string with attribute specification in the form # attr1=val1,attr2=val2 # where val1,val2,... is one of true,false,yes,no # If no value is specified then 'true' is assumed. # OUTPUTS: # fstack updated FeatureStack object containing only those # keywords matching the specified attributes # REMARK: # The 'counts' fields of the input object are retained in the output # object. In most cases it will be necessary to zero the count with # the method 'erase_count'. 
# PROECDURE: # the attribute names must match the attributes specified as # keys in the stack: # fstack.stack = { category-1 = { # keyword-a = { # attr-1 = # }, # } #- def filter_attributes(self,attributes): if attributes == '': return attrs = dict() for attr in attributes.split(','): name,value = attr.split('=') if '=' in attr else (attr,'true') attrs[ name.strip() ] = True if value.strip().lower() in ('true','yes') else False if attrs: filtered_stack = dict() filtered_stack['count'] = self.stack['count'] n_cat_in = len( self.keys() ) n_key_in = len( self.category_by_feature() ) for cat in self.keys(): for key in self.keys(cat): matches = [ x for x in attrs if self.stack[cat][key].has_key(x) and self.stack[cat][key][x] == attrs[x] ] if len(matches) == len(attrs): if not filtered_stack.has_key(cat): filtered_stack[cat] = { 'count' : self.stack[cat]['count'] } filtered_stack[cat][key] = self.stack[cat][key] self.stack = filtered_stack n_cat_out = len( self.keys() ) n_key_out = len( self.category_by_feature() ) say = tiny_bits.say('FeatureStack%s'%tiny_bits.whoami()) say.say( "%s/%s keywords in %s/%s categories"%(n_key_out,n_key_in,n_cat_out,n_cat_in) ) return def filter_by_regex(self,regex_map): say = tiny_bits.say('FeatureStack.%s'%tiny_bits.whoami()) if regex_map.has_key('key') or regex_map.has_key('cat'): filtered_stack = dict() filtered_stack['count'] = self.stack['count'] n_cat_in = len( self.keys() ) n_key_in = len( self.category_by_feature() ) for cat in self.keys(): if not regex_map.has_key('cat') or re.search(regex_map['cat'],cat): for key in self.keys(cat): if not regex_map.has_key('key') or re.search(regex_map['key'],key): if not filtered_stack.has_key(cat): filtered_stack[cat] = { 'count' : self.stack[cat]['count'] } filtered_stack[cat][key] = self.stack[cat][key] self.stack = filtered_stack n_cat_out = len( self.keys() ) n_key_out = len( self.category_by_feature() ) say = tiny_bits.say('FeatureStack.%s'%tiny_bits.whoami()) say.say( "%s/%s keywords in 
%s/%s categories"%(n_key_out,n_key_in,n_cat_out,n_cat_in) ) return #+ # NAME: # FeatureStack.keys # PURPOSE: # Return list of keys from stack object, omitting the key 'count' # used internally for counting entries. # CALLING SEQUENCE: # result = fstack.keys([category]) # INPUTS: # fstack FeatureStack object # OUTPUTS: # result list of keys #- def keys(self,cat=None,key=None): lst = (self.stack if cat == None else self.stack[cat] if key == None else self.stack[cat][key]).keys() return sorted( [ x for x in lst if x != 'count' ] ) def total(self,cat=None,key=None,year=None,month=None): return (self.stack if cat == None else self.stack[cat] if key == None else self.stack[cat][key])['count'].total(year,month) def erase_count(self): self.stack['count'] = CountCalendar() for cat in self.keys(): self.stack[cat]['count'] = CountCalendar() for key in self.keys(cat): self.stack[cat][key]['count'] = CountCalendar() return #+ # NAME: # FeatureStack.used # PURPOSE: # Get keywords from 'fstack' that are used in the specified # reference as a FeatureStack object. # CALLING SEQUENCE: # result = fstack.used(atom) # INPUTS: # fstack FeatureStack object # atom Reference object # OUTPUTS: # result FeatureStack object # The 'count' fields are set to 1 if the reference # has keyword(s) matching 'fstack', or to 0 if # there is no match. 
# PROCEDURE: #- def used(self, atom, disjoint_cat=False): all_categories = self.category_by_feature() all_features = all_categories.keys() atom_features = self.feature_list(atom) common_features = list( set(atom_features) & set(all_features) ) # Cross section used = {} if common_features: for key in common_features: cat = all_categories[key] if not used.has_key(cat): used[cat] = dict() if not used[cat].has_key(key): used[cat][key] = dict() used[ all_categories[key] ][key] = dict( zip( self.keys(cat,key), [ self.stack[cat][key][x] for x in self.keys(cat,key) ] ) ) # If an atom is a in more than one category, count only the first one if disjoint_cat: break year, month = atom.timestamp.get(attr=['year','month']) count = CountCalendar( { year: {month[0]: 1} } ) else: count = CountCalendar() say = tiny_bits.say('FeatureStack.%s'%tiny_bits.whoami()) say.debug( "atom does not match '%s'\n%s"%(','.join(all_features),atom.__repr__()) ) return FeatureStack( used, count=count ) #+ # NAME: # FeatureStack.count_atoms # PURPOSE: # Count number of references matching specified keywords. # CALLING SEQUENCE: # keywords.count_atoms(ref_list) # INPUTS: # keywords FeatureStack object # ref_list ReferenceList object # OUTPUTS: # keywords FeatureStack object with updated 'count' fields. #- def count_atoms(self, refs, disjoint_cat=False): self.erase_count() # Make sure all 'count' fields start out zero for ref in refs.atoms: self += self.used(ref, disjoint_cat) return class KeywordStack( FeatureStack ): #+ # NAME: # KeywordStack.__init__ # PURPOSE: # Initialize keyword_stack object # CALLING SEQUENCE: # result = KeywordStack( stack_yaml_file ) # INPUTS: # stack_yaml_file fully-qualified name of yaml file containing # the dictionary of key definitions. # Alternatively, the dict itself can be specified # as input. # OPTIONAL INPUT: # ignore list (?? 
unused) # OUTPUTS: # result KeywordStack object # PROCEDURE: #- def __init__(self, yaml_file ): self.feature_list = lambda ref: ref.feature['topkey'] super(KeywordStack,self).__init__(yaml_file) return class WorldStack( FeatureStack ): def __init__(self, ref_list ): country = [ author['country'] for ref in ref_list.atoms for author in ref.feature['author'] if author['country'] != 'unknown' ] country = list( set(country) ) world = { 'world': dict( zip( country, [ dict() for x in range(len(country)) ] ) ) } super(WorldStack,self).__init__(world) self.country_name = list_of_top_level_domains() self.feature_list = lambda author: [ author.feature['details']['country'] if author.feature['details']['country'] != 'unknown' else '' ] return def country(self): tlds = self.keys('world') return dict( zip( tlds, [self.country_name[tld] for tld in tlds] ) ) class AuthorStack( FeatureStack ): def __init__(self, ref_list ): name = [ author.feature['details']['last']+', '+author.feature['details']['first'] for author in AuthorList( ref_list ).atoms ] name = list( set( name ) ) authors = { 'authors': dict( zip( name, [ dict() for x in range(len(name))] ) ) } super(AuthorStack,self).__init__(authors) self.feature_list = lambda author: [ author.feature['details']['last']+', '+author.feature['details']['first'] ] return