Package commons :: Module strs
[hide private]
[frames | no frames]

Source Code for Module commons.strs

  1  # -*- mode: python; tab-width: 4; indent-tabs-mode: nil; py-indent-offset: 4; -*- 
  2  # vim:ft=python:et:sw=4:ts=4 
  3   
  4  """ 
  5  String formatting, encoding, etc. 
  6  """ 
  7   
  8  __all__ = ''' 
  9  and_join 
 10  cp1252_to_unicode 
 11  cp1252_to_unicode_translations 
 12  dos2unix 
 13  format 
 14  html2unicode 
 15  indent 
 16  nat_lang_join 
 17  or_join 
 18  remove_empty_lines 
 19  safe_ascii 
 20  underline 
 21  unicode2html 
 22  unicode_special 
 23  unwrap 
 24  '''.split() 
 25   
 26  import itertools, cgi, re, unittest 
 27   
def format(*args):
    """
    Formats the args as they would be by the C{print} built-in: every
    argument is converted with C{str} and the pieces are joined with single
    spaces.

    @param args: The values to format.

    @return: The space-separated string (empty when no args are given).
    """
    # A generator expression yields the same result as itertools.imap but
    # does not depend on a function that only exists on Python 2.
    return ' '.join(str(arg) for arg in args)
31
def safe_ascii(s):
    """
    Casts a Unicode string to a regular ASCII string when that is possible
    without losing any characters. Anything else (non-ASCII Unicode strings,
    byte strings, non-string objects) is returned unchanged.

    @param s: The value to convert.

    @return: The ASCII-encoded string, or L{s} itself when it is not a pure
        ASCII Unicode string.
    """
    try:
        unicode_type = unicode
    except NameError:
        # No separate unicode type on this interpreter: every string is
        # already the native string type, so there is nothing to cast.
        return s
    if isinstance(s, unicode_type):
        try:
            return s.encode('ascii')
        except UnicodeEncodeError:
            # Contains non-ASCII characters; str(s) used to raise here.
            return s
    return s
# Pairs of (C1 control character as found in mis-decoded cp1252 text,
# intended Unicode character). Bytes 0x81, 0x8D, 0x8F, 0x90 and 0x9D have no
# entry in this table.
cp1252_to_unicode_translations = [
    (u'\x80', u'\u20AC'),  # euro sign
    (u'\x82', u'\u201A'),  # single low-9 quotation mark
    (u'\x83', u'\u0192'),  # latin small letter f with hook
    (u'\x84', u'\u201E'),  # double low-9 quotation mark
    (u'\x85', u'\u2026'),  # horizontal ellipsis
    (u'\x86', u'\u2020'),  # dagger
    (u'\x87', u'\u2021'),  # double dagger
    (u'\x88', u'\u02C6'),  # modifier letter circumflex accent
    (u'\x89', u'\u2030'),  # per mille sign
    (u'\x8A', u'\u0160'),  # latin capital letter s with caron
    (u'\x8B', u'\u2039'),  # single left-pointing angle quotation mark
    (u'\x8C', u'\u0152'),  # latin capital ligature oe
    (u'\x8E', u'\u017D'),  # latin capital letter z with caron
    (u'\x91', u'\u2018'),  # left single quotation mark
    (u'\x92', u'\u2019'),  # right single quotation mark
    (u'\x93', u'\u201C'),  # left double quotation mark
    (u'\x94', u'\u201D'),  # right double quotation mark
    (u'\x95', u'\u2022'),  # bullet
    (u'\x96', u'\u2013'),  # en dash
    (u'\x97', u'\u2014'),  # em dash
    (u'\x98', u'\u02DC'),  # small tilde
    (u'\x99', u'\u2122'),  # trade mark sign
    (u'\x9A', u'\u0161'),  # latin small letter s with caron
    (u'\x9B', u'\u203A'),  # single right-pointing angle quotation mark
    (u'\x9C', u'\u0153'),  # latin small ligature oe
    (u'\x9E', u'\u017E'),  # latin small letter z with caron
    (u'\x9F', u'\u0178'),  # latin capital letter y with diaeresis
]
def cp1252_to_unicode(x):
    """
    Converts characters 0x80 through 0x9f to their proper Unicode
    equivalents, applying every pair of L{cp1252_to_unicode_translations} in
    order. See U{http://www.intertwingly.net/stories/2004/04/14/i18n.html}
    for the translation table on which this is based.

    @param x: The text to convert.

    @return: The text with each listed character replaced.
    """
    converted = x
    for cp1252_char, unicode_char in cp1252_to_unicode_translations:
        converted = converted.replace(cp1252_char, unicode_char)
    return converted
76
def unwrap(s):
    """
    Joins a bunch of lines into a single space-separated line. L{s} is
    either a single string (which will be stripped and split on newlines
    into a list of strings) or a list of strings (representing lines). Each
    line is stripped of surrounding whitespace before joining.

    @param s: A string or a sequence of line strings.

    @return: The lines joined by single spaces.
    """
    # type(u'') is the Unicode text type. Checking only C{str} would let a
    # Unicode string fall through and be iterated character by character.
    if isinstance(s, (str, type(u''))):
        s = s.strip().split('\n')
    return ' '.join(line.strip() for line in s)
85
def indent(s, ind = ' '):
    """
    Prefixes each line in L{s} with L{ind}. L{s} can be either a string
    (which will be broken up on newlines into a list of lines) or a list of
    strings (treated as lines).

    @param s: A string or a sequence of line strings.

    @param ind: The prefix put in front of every line.

    @return: A single (indented) string with the lines joined by newlines.
    """
    # type(u'') is the Unicode text type. Checking only C{str} would let a
    # Unicode string fall through and be iterated character by character.
    if isinstance(s, (str, type(u''))):
        s = s.split('\n')
    return '\n'.join(ind + line for line in s)
94
def unindent(text, amt = None):
    """
    Strips leading characters from every line of L{text}.

    If L{amt} is 0, removes all leading whitespace from each line. If L{amt}
    is L{None}, finds the smallest amount of leading whitespace on any
    non-empty line and removes that many chars from each line (for text that
    is entirely whitespace, the length of the whole text is used). If L{amt}
    is positive, removes L{amt} chars from each line.

    @param text: The newline-separated text.

    @param amt: The number of characters to remove, 0, or L{None}.

    @return: The unindented text, lines joined by newlines.
    """
    rows = text.split('\n')
    if amt == 0:
        return '\n'.join(row.lstrip() for row in rows)
    if amt is None:
        if text.strip() == '':
            amt = len(text)
        else:
            # Width of the leading whitespace run of each non-blank line.
            amt = min(len(row) - len(row.lstrip())
                      for row in rows if row.strip() != '')
    return '\n'.join(row[amt:] for row in rows)
def remove_empty_lines(s):
    """
    Removes all empty lines (or lines of just whitespace).

    @param s: The newline-separated text.

    @return: The remaining lines joined by newlines.
    """
    kept = [row for row in s.split('\n') if row.strip()]
    return '\n'.join(kept)
116
def underline(s, sep):
    """
    Appends to L{s} a newline and a number of repetitions of L{sep}; the
    number of repetitions is the length of L{s}.

    @param s: The text to underline.

    @param sep: The string repeated to form the rule.

    @return: L{s}, a newline, and the rule.
    """
    rule = sep * len(s)
    return '\n'.join((s, rule))
123
def dos2unix(s):
    """
    Removes carriage returns. Every C{'\\r'} in L{s} is dropped, whether or
    not it is followed by a newline.

    @param s: The text to clean.

    @return: The text without carriage returns.
    """
    pieces = s.split('\r')
    return ''.join(pieces)
127
def quotejs(s):
    """
    Escape a string as the body of a JavaScript unicode string literal:
    every character is written as a C{\\uXXXX} escape. No surrounding quotes
    are added.

    @param s: The string to escape.

    @return: The concatenated escapes.
    """
    escapes = []
    for c in s:
        code = ord(c)
        if code > 0xFFFF:
            # A JavaScript \u escape takes exactly four hex digits, so a
            # code point beyond the BMP must be written as a UTF-16
            # surrogate pair.
            code -= 0x10000
            high = 0xD800 + (code >> 10)
            low = 0xDC00 + (code & 0x3FF)
            escapes.append(r'\u%04x\u%04x' % (high, low))
        else:
            escapes.append(r'\u%04x' % code)
    return ''.join(escapes)
# Matches any single character in the range U+0080 through U+FFFF, i.e.
# every character of that range that lies outside 7-bit ASCII.
_unicode_special_class = u'[\u0080-\uffff]'
unicode_special = re.compile(_unicode_special_class)
def unicode2html(s):
    """
    Extends cgi.escape() with escapes for unicode characters: L{s} is first
    passed through C{cgi.escape}, then every character matched by
    L{unicode_special} is replaced with a numeric character reference.

    @param s: The text to escape.

    @return: The escaped text.
    """
    def to_char_ref(match):
        # HTML special/Unicode char encoding is in base 10.
        return '&#%d;' % ord(match.group())

    escaped = cgi.escape(s)
    return unicode_special.sub(to_char_ref, escaped)
138
def html2unicode(text):
    """
    Sort of a cgi.unescape (doesn't exist). Replaces HTML or XML character
    references (C{&#65;}, C{&#x41;}) and named entities (C{&amp;}) in a text
    string with the characters they stand for. References that cannot be
    resolved (unknown names, malformed or out-of-range numbers) are left
    untouched.

    http://effbot.org/zone/re-sub.htm#unescape-html

    @param text: The text containing references and entities.

    @return: The text with resolvable references replaced.
    """
    # The entity table and the code-point-to-character builtin go by
    # different names depending on the Python version; resolve them here so
    # the function does not depend on either spelling.
    try:
        from htmlentitydefs import name2codepoint
    except ImportError:
        from html.entities import name2codepoint
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    def fixup(m):
        token = m.group(0)
        if token[:2] == "&#":
            # character reference
            try:
                # The hex marker may be written as either "x" or "X".
                if token[:3].lower() == "&#x":
                    return to_char(int(token[3:-1], 16))
                return to_char(int(token[2:-1]))
            except (ValueError, OverflowError):
                # Not a number, or a number no character exists for.
                pass
        else:
            # named entity
            try:
                return to_char(name2codepoint[token[1:-1]])
            except KeyError:
                pass
        return token  # leave as is

    # Raw string so that \w reaches the regex engine unmodified.
    return re.sub(r"&#?\w+;", fixup, text)
def nat_lang_join(xs, last_glue, two_glue = None, glue = ', '):
    """
    Natural-language join. Join a sequence of strings together into a
    comma-separated list, but where the last pair is joined with the given
    special glue. (You may also override the non-last glue, which defaults to
    a ', '.)

    @param xs: The sequence of strings. This must be a list-like sequence, not
    a generated one.

    @param last_glue: The string used to join the final pair of elements, when
    there are more than two elements.

    @param two_glue: The string used to join both elements in a 2-element
    sequence. Defaults to None, which means to use last_glue.

    @param glue: The string used to join all the other elements.

    @return: The joined string; '' for an empty sequence and the sole element
    for a 1-element sequence.
    """
    # Honour the documented default: without this, a 2-element sequence
    # would try to concatenate None and raise a TypeError.
    if two_glue is None:
        two_glue = last_glue
    count = len(xs)
    if count == 0:
        return ''
    if count == 1:
        return xs[0]
    if count == 2:
        return xs[0] + two_glue + xs[1]
    return glue.join(xs[:-1]) + last_glue + xs[-1]
189
def or_join(xs):
    """
    Joins L{xs} as an English "or" list via L{nat_lang_join}: C{' or '}
    between two elements, C{', or '} before the last of three or more.
    """
    return nat_lang_join(xs, last_glue=', or ', two_glue=' or ')
def and_join(xs):
    """
    Joins L{xs} as an English "and" list via L{nat_lang_join}: C{' and '}
    between two elements, C{', and '} before the last of three or more.
    """
    return nat_lang_join(xs, last_glue=', and ', two_glue=' and ')
192
class str_test(unittest.TestCase):
    """Unit tests for the natural-language join helpers."""

    def test_nat_lang_join(self):
        """Checks the three-element and two-element join forms."""
        three_words = 'alpha beta gamma'.split()
        two_words = 'alpha beta'.split()
        self.assertEqual(nat_lang_join(three_words, ' | '),
                         'alpha, beta | gamma')
        self.assertEqual(and_join(three_words),
                         'alpha, beta, and gamma')
        self.assertEqual(or_join(two_words),
                         'alpha or beta')
# Run the module's unit tests when it is executed as a script.
if __name__ == '__main__':
    unittest.main()