# Written 27 Jan 2004 by Andy Jacobson (andyj@splash.princeton.edu).
# $Id: add_labels.awk,v 1.2 2004/01/28 18:40:29 arj Exp $
# Time-stamp:
#
# This script assigns labels to EndNote records.  It is very specific
# to my needs and is unlikely to be useful to others without
# modifications.  I simply want to show that awk can be used to do
# arbitrary manipulation of EndNote libraries, and share the knowledge
# I gained in the process of doing so.
#
# Run this script by doing "awk -f add_labels.awk filename.txt".  This
# assumes that the script is called "add_labels.awk" and that the
# tab-delimited output from EndNote is called "filename.txt".  The
# result is written to "filename.out.txt" (or "<name>.out" when the
# input name does not end in ".txt").
#
# The labels generated by this script are intended to be unique.  I
# use them to name the electronic copies of manuscripts that I keep, and
# to specify temporary citations in documents I write.  The labels are
# of the form lastnameYYx, where:
#
#   "lastname" is the lowercased last name of the first author
#
#   "YY" is the non-y2k compliant two-digit year abbreviation: e.g. 04
#        for 1904 or 2004.
#
#   "x", the trailing suffix, is a letter (a,b,c,d, ...) that ensures
#        uniqueness.  Once a..z are used up the suffix continues with
#        aa, ab, ..., az, ba, ...  The order of the letter is order of
#        occurrence to me, not chronological.  So smith95a is the first
#        paper by some Smith in 1995 that I have come across.  smith95b
#        is the second, regardless of whether it is the same Smith or
#        whether it was published before smith95a.
#
# The script operates on tab-delimited output from EndNote, and
# creates tab-delimited input for EndNote.  Note that these are
# slightly different formats!  It is vital that carriage returns be
# stripped from the EndNote library before exporting to tab-delimited
# output.  This is easily accomplished by doing a first output-input
# cycle using the "EndNote export" style.  For explicit instructions
# on this, see Robert W. Gear's guide at
# http://www.gordonmckenzie.co.uk/academic/endnote/
#
# I used EndNote 7.0 on OS X 10.3.2 when writing this script.  The
# default tab-delimited format operating on my library has 38 fields
# in the output, the last of which is the label field and the first
# of which is the record number.  EndNote apparently does not want the
# record number in the processed, ready-to-be-imported file.  It also
# requires two header lines, the details of which may vary from
# platform to platform, and may even depend on what data are in the
# library being exported.  Certainly if the tab-delimited output
# format has been changed (via the edit output style interface), this
# script will have to be changed to match the output.
#
# Awk scripts give instructions to operate on each record (line) of a
# text file.  Looping through the lines of the input file is implicit.
# In addition to the instructions for this central processing loop,
# awk allows one to specify BEGIN and END blocks which do extra
# processing before the loop and after the loop respectively.  In this
# script we use a BEGIN block, but no END block.

BEGIN {
    # Tell awk that the input field separator is a tab and the input
    # record separator is a carriage return.  The carriage return as
    # line delimiter is Mac-specific; Windows uses something else.
    FS = "\t"
    RS = "\r"

    # Now request the same delimiters for the output.
    OFS = "\t"
    ORS = "\r"

    # Field positions in the tab-delimited export.  Change these if the
    # output style has been edited.
    AUTHOR_FIELD = 3
    YEAR_FIELD   = 4
    LABEL_FIELD  = 38

    # The input file is read twice (once here, once by the main loop),
    # so it must be a real file named on the command line.
    if (ARGC < 2) {
        printf "usage: awk -f add_labels.awk filename.txt\n" > "/dev/stderr"
        exit 1
    }
    fnin = ARGV[ARGC-1]

    # Collect every existing label as a key of cl, so that uniqueness
    # of a new label is a single "in" test.  Reading from the file is
    # sufficient to open it, whereas closing the file must be done
    # explicitly.  getline returns -1 on error, which is "true", so the
    # result has to be compared against 0 or an unreadable file would
    # loop forever.
    while ((status = (getline < fnin)) > 0) {
        if ($LABEL_FIELD != "")
            cl[$LABEL_FIELD] = 1
    }
    if (status < 0) {
        printf "add_labels: cannot read %s\n", fnin > "/dev/stderr"
        exit 1
    }
    close(fnin)

    # Create the output file name: foo.txt -> foo.out.txt, anything
    # else -> <name>.out.  The dot is escaped and the match anchored so
    # only a literal trailing ".txt" is rewritten.
    fnout = fnin
    if (!sub(/\.txt$/, ".out.txt", fnout))
        fnout = fnin ".out"

    # Column names for the second header line, one per output field
    # ($2..$38).  They are written with "|" here and then joined with
    # OFS so the header is guaranteed to be tab-delimited.
    hdr = "Reference Type|Author|Year|Title|Secondary Author|Secondary Title"
    hdr = hdr "|Place Published|Publisher|Volume|Number of Volumes|Number"
    hdr = hdr "|Pages|Section|Tertiary Author|Tertiary Title|Edition|Date"
    hdr = hdr "|Type of Work|Subsidiary Author|Short Title|Alternate Title"
    hdr = hdr "|ISBN/ISSN|Original Publication|Reprint Edition|Reviewed Item"
    hdr = hdr "|Custom 1|Custom 2|Custom 3|Custom 4|Custom 5|Custom 6"
    hdr = hdr "|Accession Number|Call Number|Author Address|Image|Caption"
    hdr = hdr "|Label"
    gsub(/\|/, OFS, hdr)

    # Write the two required header lines to the output file.
    print "*Generic" > fnout
    print hdr > fnout
}

# The BEGIN section is over.  Now the main section begins, in which
# processing implicitly loops over each line of the input file.

# This pattern is like an "if" statement: do the following only when
# the label field of the current record is empty.
$LABEL_FIELD == "" {
    # Core of the label: lowercased first-author last name + 2-digit year.
    core = label($AUTHOR_FIELD, $YEAR_FIELD)

    # Try core+"a", core+"b", ... (then "aa", "ab", ...) until one is
    # found that is not already in use.
    n = 0
    do {
        mayb = core suffix(++n)
    } while (mayb in cl)

    $LABEL_FIELD = mayb     # change the label field of the current record
    cl[mayb] = 1            # remember it so later labels stay unique too
}

# Output the current record.  Note that we omit $1, the first field,
# which in my case is the record number.  EndNote apparently does
# not want that field in an input file, so we emit fields 2 through
# LABEL_FIELD rather than the shorthand $0 ("all fields").
{
    line = $2
    for (i = 3; i <= LABEL_FIELD; i++)
        line = line OFS $i
    print line > fnout
}

# That's it for the core section, no more processing is required!

# label(namelist, year)
#
# Take the full list of all authors and the 4-digit year, and return
# the lowercased last name of the first author followed by the last
# two digits of the year.  Assumes that authors are separated by
# semicolons (EndNote also allows a double backslash, I think), and
# that the last name is the last element in each space-delimited
# author name.  Name suffixes like "Jr." or "II" may cause this to
# fail, depending on how they are written in the library.
#
# The names after the extra spaces in the parameter list are locals.
function label(namelist, year,    authors, parts, nparts) {
    split(namelist, authors, ";")
    nparts = split(authors[1], parts, " ")   # nparts is the element count
    return tolower(parts[nparts]) substr(year, 3, 2)
}

# suffix(n)
#
# Return the n-th uniqueness suffix (n >= 1): 1 -> "a", 26 -> "z",
# 27 -> "aa", 28 -> "ab", ...  Uses substr() rather than splitting a
# string on "" because the latter is not portable across awks.
function suffix(n,    s) {
    s = ""
    while (n > 0) {
        n--
        s = substr("abcdefghijklmnopqrstuvwxyz", (n % 26) + 1, 1) s
        n = int(n / 26)
    }
    return s
}