# filename: titlecase.awk
#   author: Eric Pement - pemente@northpark.edu, eric.pement@moody.edu
#  version: 1.21
#     date: 3 January 2003
# requires: GNU awk
#  purpose: To convert UPPER/lower/MiXeD case strings to "Title Case"
#
#   credit: This function was inspired by totitle.awk by M. Joshua Ryan,
#           which had a similar purpose but not enough exception handling.
#           I completely rewrote it, added debugging and other features.
#
# function: titlecase("CHANGE TO TITLE CASE") --> "Change to Title Case"
#
# Features:
#   This function will compress whitespace if a second parameter is passed to
#   the function. It is sufficient to use a positive number:
#        titlecase(string,1)
#
#   Debugging/diagnostic output will be printed to stdout if "-v debug=1" is
#   added as a command-line argument to gawk.
#
#   This function tries to implement the "Title Case" constructs specified in
#   the APA Style Manual and the Chicago Manual of Style. Instead of merely
#   capitalizing the first letter of each word and setting everything else in
#   lowercase, this function implements the following conditions:
#
#    - Conjunctions, articles, and prepositions are set lowercase, UNLESS they
#      are the first word of the string or the first word after a colon, a
#      question mark, or an exclamation point.
#    - Compass points (NE, SW, etc.) are set in solid caps.
#    - Roman numerals (II, IV, VII, IX, etc.) are set in solid caps.
#    - Certain abbreviations are always capitalized (AIDS, ASCII, NT, USA, etc.)
#    - Names beginning with D' or O' are set as D'Arcy, O'Reilly, etc.
#    - Hyphenated strings receive internal caps (Smith-Williams, Twenty-Two)
#    - Contractions such as I'll, You've, Don't, etc. are handled properly
#    - Degrees such as Ph.D., M.Div., etc. are properly capitalized
#
# Globals:
#   keep_lower, keep_upper - word lists built once in the BEGIN block
#   debug                  - set with "-v debug=1" to enable diagnostics
#   Every other variable used by titlecase() is local to the function, so
#   calling it does not disturb same-named variables in the calling program.
#
# CHANGELOG:
#   1.21 - explicitly declared a field separator for split() function

BEGIN {
    #-----ABBREVIATIONS TO BE SET IN LOWERCASE-----
    # Articles and conjunctions
    articles = "a an the and but for or so "

    # Prepositions
    # Note: This list will not be perfect, since some prepositions require
    # grammatical analysis which is beyond the capacity of this script.
    # Omitted: over (=finished), under, through, before, after
    preps = "against at between by in into of on to upon "

    # Build array of words to be set lowercased
    split(articles preps, keep_lower, " ")

    #-----ABBREVIATIONS TO BE SET IN SOLID CAPS-----
    # Compass points
    compass = "NE NW SE SW "

    # Religious references - add to only as needed
    religious = "OT NT LXX YHWH BC BCE AD CE MBI KJV ASV NIV NASB TEV RSV NRSV "

    # State names - add to only as needed
    # Never add abbreviations which may be whole English words (hi, in, oh,
    # ok, me, us) or part of hyphenated words (al-, co-, de-).
    states = "CA DC IL MI NV NJ NY USA "

    # Other abbreviations - add only as needed
    other = "AIDS ASCII CD DHTML DNA FBI GNU GPL IBM IRS ISBN ISSN PHP ROM SSN "

    # build array of words to keep uppercase
    split(compass religious states other, keep_upper, " ")
}

# titlecase(string [, x])
#   string - text to convert, in any mix of upper and lower case
#   x      - optional; if true, each run of spaces/tabs is squeezed to one
#            space before conversion
#   Returns the converted string.
#
#   The names after the wide gap in the parameter list are local working
#   variables (the awk idiom for locals); callers must not pass them.
function titlecase(string, x,    a, b, compress, hit, pos, word, head, tail, var, var2, done, beg, cap, rest) {

    # Initialize variables
    a = "";            # a is/will be the string ALREADY converted
    b = string;        # b is the rest of the string, so that (string = a b)
    compress = x;      # optional compression argument

    if (compress) {    # Compress spaces or tabs if 2nd argument passed
        gsub(/[ \t]+/, " ", b)
        if (debug) print "DIAG: Compress argument passed to function call"
    }

    b = toupper(b)     # Capitalize everything for ease of matching

    do {
        hit = 0;       # Initialize for later use

        # pos is the position of the NEXT punctuation mark (except apostrophe)
        # after the current word. If this is the last word in b, pos will be 0.
        # match() automatically sets RLENGTH
        pos = match(b, /[^A-Z']+/)

        # The current word INCLUDES the run of punctuation/whitespace that
        # follows it, so that a and b always concatenate back to the input.
        if (pos > 0) word = substr(b, 1, pos + RLENGTH - 1)
        else         word = b

        # 1st char of current word
        head = substr(b, 1, 1)

        # tail of current word
        if (pos > 0) tail = substr(b, 2, pos + RLENGTH - 2)
        else         tail = substr(b, 2)

        # shorten the rest of the string
        # (on the last word pos is 0 and RLENGTH is -1, which leaves b as it
        # is; the loop condition below then ends the loop)
        b = substr(b, pos + RLENGTH)

        #----Words to keep uppercase----
        # Case 1: abbreviations from the keep_upper array.
        for (var in keep_upper) {
            hit = match(word, "^" keep_upper[var] "\\>")
            if (hit > 0) {
                if (debug)
                    print "DIAG: Match UC on [" keep_upper[var] "] in string [" word "]";
                break;
            }
        }

        # Case 2: Roman numerals
        # Note: this match cannot distinguish between LIV (54 in Roman numerals)
        # and a personal name like "Liv Ullman". The Roman numerals C (100),
        # D (500), and M (1000) are omitted to avoid false matches on words like
        # civil, did, dim, lid, mid-, mild, Vic, etc. Most uses of Roman numerals
        # in titles stays in the lower ranges, such as "Vol. II" or "Pt. XXIV".
        if (hit == 0 && match(word, /^[IVXL]+\>/)) {
            hit = 1
            # But we can undo I'd, I'll, I'm, I've and Ill.
            if (match(word, /^I'|ILL\>/)) hit = 0
            if (debug && hit == 1)
                print "DIAG: Match on Roman numerals in [" word "]"
        }

        #----Words to be set in MiXed case----
        # Case 3: Names like D'Arcy or O'Reilly
        if (hit == 0 && match(word, /^[DO]'[A-Z]/)) {
            if (debug) print "DIAG: Match on mixed case: " word
            word = substr(word, 1, 3) tolower(substr(word, 4))
            hit = 1
        }

        # Case 4: Names like MacNeil or McDonald
        # RLENGTH is the length of the "MC?"/"MAC?" prefix including the
        # consonant that follows it; that consonant stays capitalized.
        if (hit == 0 && match(word, /^MA?C[B-DF-HJ-NP-TV-Z]/)) {
            if (debug)
                print "DIAG: Match on MacX: " substr(word, 1, 1) "-" \
                      tolower(substr(word, 2, RLENGTH - 2)) "-" \
                      substr(word, RLENGTH, 1) "-" \
                      tolower(substr(word, RLENGTH + 1))
            word = substr(word, 1, 1) tolower(substr(word, 2, RLENGTH - 2)) \
                   substr(word, RLENGTH, 1) tolower(substr(word, RLENGTH + 1))
            hit = 1
        }

        #----Words to set in lowercase----
        # Case 5: articles, conjunctions, prepositions from the keep_lower array
        if (hit == 0) {
            for (var2 in keep_lower) {
                hit = sub("^" toupper(keep_lower[var2]) "\\>", keep_lower[var2], word);
                if (hit > 0) {
                    if (debug)
                        print "DIAG: Match LC on [" keep_lower[var2] "] in string [" word "]";
                    break;
                }
            }
        }

        #----Default: Capitalize everything else normally----
        if (hit > 0) a = a word
        else         a = a toupper(head) tolower(tail)

    } while (pos > 0);

    # Everything should be converted now.

    # Double exception 1: Set 1st word of string in Cap case
    # Need to handle potential internal single/double quotes like
    # "A Day in the Life" or 'On the Waterfront'
    # (if a contains no letter, RSTART is 0 and a is left unchanged)
    match(a, /[A-Za-z]/)
    a = toupper(substr(a, 1, RSTART)) substr(a, RSTART + 1)

    # Double exception 2: Set 1st word after a colon, question mark or
    # exclamation point in title case. This kludge handles multiple colons,
    # question marks, etc. on the line. \a is the BEL or CTRL-G character.
    done = gensub(/([:?!][^a-zA-Z]*)([a-zA-Z])/, "\\1\a\\2", "g", a)

    # Each BEL marks a letter to capitalize: drop the marker and uppercase
    # the character that follows it.
    while (match(done, /\a/)) {
        beg  = substr(done, 1, RSTART - 1)
        cap  = toupper(substr(done, RSTART + 1, 1))
        rest = substr(done, RSTART + 2)
        done = beg cap rest
    }

    return done
}
#---end of awk script---