commons.strs

# -*- mode: python; tab-width: 4; indent-tabs-mode: nil; py-indent-offset: 4; -*-
# vim:ft=python:et:sw=4:ts=4

"""
String formatting, encoding, etc.
"""

# Names exported by ``from commons.strs import *``.
__all__ = '''
and_join
cp1252_to_unicode
cp1252_to_unicode_translations
dos2unix
format
html2unicode
indent
nat_lang_join
or_join
remove_empty_lines
safe_ascii
underline
unicode2html
unicode_special
unwrap
'''.split()

import itertools, cgi, re, unittest

def format( *args ):
    """
    Formats the args as they would be by the C{print} built-in: each argument
    is converted with C{str} and the results are joined with single spaces.

    @param args: The values to format.

    @return: The space-separated string (empty when no args are given).
    """
    # The built-in map behaves the same on Python 2 and 3 here, whereas
    # itertools.imap only exists on Python 2.
    return ' '.join( map( str, args ) )

31

def safe_ascii( s ):
    """
    Casts a Unicode string to a regular ASCII string when that can be done
    without loss. Unicode strings containing non-ASCII characters, and
    objects that are not Unicode strings at all, are returned unchanged.

    @param s: The object to convert.

    @return: C{str( s )} if L{s} is a pure-ASCII Unicode string, else L{s}.
    """
    # type(u'') is unicode on Python 2 and str on Python 3.
    if isinstance( s, type( u'' ) ):
        try:
            return str( s )
        except UnicodeEncodeError:
            # On Python 2, str() of non-ASCII unicode raises instead of
            # returning something comparable; keep the original string.
            return s
    return s

# Pairs of (character in the 0x80-0x9F range, Unicode replacement character),
# applied in order by cp1252_to_unicode(). 0x81, 0x8D, 0x8F, 0x90 and 0x9D
# have no entry.
cp1252_to_unicode_translations = [
    (u'\x80', u'\u20AC'),
    (u'\x82', u'\u201A'),
    (u'\x83', u'\u0192'),
    (u'\x84', u'\u201E'),
    (u'\x85', u'\u2026'),
    (u'\x86', u'\u2020'),
    (u'\x87', u'\u2021'),
    (u'\x88', u'\u02C6'),
    (u'\x89', u'\u2030'),
    (u'\x8A', u'\u0160'),
    (u'\x8B', u'\u2039'),
    (u'\x8C', u'\u0152'),
    (u'\x8E', u'\u017D'),
    (u'\x91', u'\u2018'),
    (u'\x92', u'\u2019'),
    (u'\x93', u'\u201C'),
    (u'\x94', u'\u201D'),
    (u'\x95', u'\u2022'),
    (u'\x96', u'\u2013'),
    (u'\x97', u'\u2014'),
    (u'\x98', u'\u02DC'),
    (u'\x99', u'\u2122'),
    (u'\x9A', u'\u0161'),
    (u'\x9B', u'\u203A'),
    (u'\x9C', u'\u0153'),
    (u'\x9E', u'\u017E'),
    (u'\x9F', u'\u0178'),
]

def cp1252_to_unicode(x):
    """
    Converts characters 0x80 through 0x9f to their proper Unicode
    equivalents. See
    U{http://www.intertwingly.net/stories/2004/04/14/i18n.html} for the nice
    translation table on which this is based.

    @param x: The string to convert.

    @return: L{x} with every character listed in
    L{cp1252_to_unicode_translations} replaced by its counterpart.
    """
    for old, new in cp1252_to_unicode_translations:
        x = x.replace(old, new)
    return x

76

def unwrap(s):
    """
    Joins a bunch of lines. L{s} is either a single string (which will be
    stripped and then split on newlines into a list of strings) or a list of
    strings (representing lines). Each line is stripped and the lines are
    joined with single spaces.

    @param s: A string or a sequence of line strings.

    @return: The joined single-line string.
    """
    # type(u'') is unicode on Python 2 and str on Python 3. Checking it as
    # well keeps a Python 2 unicode string from being iterated character by
    # character.
    if isinstance(s, (str, type(u''))):
        s = s.strip().split('\n')
    return ' '.join( line.strip() for line in s )

85

def indent(s, ind = ' '):
    """
    Prefixes each line in L{s} with L{ind}. L{s} can be either a string (which
    will be broken up into a list of lines) or a list of strings (treated as
    lines). Returns a single (indented) string.

    @param s: A string or a sequence of line strings.

    @param ind: The prefix put in front of every line, including empty ones.

    @return: The prefixed lines joined with newlines.
    """
    # type(u'') is unicode on Python 2 and str on Python 3. Checking it as
    # well keeps a Python 2 unicode string from being iterated character by
    # character.
    if isinstance(s, (str, type(u''))):
        s = s.split('\n')
    return '\n'.join( ind + line for line in s )

94

def unindent(text, amt = None):
    """
    If L{amt} is 0, removes all leading whitespace from each line in L{text}.
    If L{amt} is L{None}, finds the smallest amount of leading whitespace on
    any non-empty line and removes that many chars from each line. If L{amt}
    is positive, removes L{amt} chars from each line.

    @param text: The text to process; lines are separated by newlines.

    @param amt: 0, L{None}, or the number of leading chars to drop per line.

    @return: The processed lines joined with newlines.
    """
    lines = text.split('\n')
    if amt == 0:
        return '\n'.join( line.lstrip() for line in lines )
    if amt is None:
        if text.strip() == '':
            # Nothing but whitespace: drop everything on every line.
            amt = len(text)
        else:
            # Width of the leading whitespace of the least-indented line,
            # ignoring lines that are empty or all whitespace.
            amt = min( len(line) - len(line.lstrip())
                       for line in lines if line.strip() != '' )
    return '\n'.join( line[amt:] for line in lines )

def remove_empty_lines(s):
    """
    Removes all empty lines (or lines of just whitespace).

    @param s: The text to filter; lines are separated by newlines.

    @return: The remaining lines joined with newlines.
    """
    kept = [ line for line in s.split('\n') if line.strip() != '' ]
    return '\n'.join(kept)

116

def underline(s, sep):
    """
    Appends to L{s} a newline and a number of repetitions of L{sep}; the number
    of repetitions is the length of L{s}.

    @param s: The text to underline.

    @param sep: The string repeated to form the underline.

    @return: L{s}, a newline, and the underline.
    """
    rule = sep * len(s)
    return s + '\n' + rule

123

def dos2unix(s):
    """
    Removes carriage returns.

    @param s: The text to convert.

    @return: L{s} with every C{'\\r'} character deleted.
    """
    return s.replace('\r', '')

127

def quotejs(s):
    r"""
    Escape a string as a JavaScript unicode string literal. Every character is
    written as a C{\uXXXX} escape with four hex digits; no surrounding quotes
    are added.

    @param s: The string to escape.

    @return: The concatenated escapes (empty for an empty string).
    """
    def escape(c):
        code = ord(c)
        if code > 0xFFFF:
            # A \u escape takes exactly four hex digits, so a character
            # outside the BMP has to be written as a UTF-16 surrogate pair.
            code -= 0x10000
            return r'\u%04x\u%04x' % ( 0xD800 + (code >> 10),
                                       0xDC00 + (code & 0x3FF) )
        return r'\u%04x' % code
    return ''.join( escape(c) for c in s )

# Matches any single non-ASCII character. A negated ASCII class is used rather
# than an explicit upper bound so that characters above U+FFFF are matched too.
unicode_special = re.compile(u'[^\\x00-\\x7f]')

def unicode2html(s):
    """
    Extends cgi.escape() with escapes for all unicode characters: C{&}, C{<}
    and C{>} are escaped first, then every character matched by
    L{unicode_special} is replaced by a decimal character reference.

    @param s: The text to escape.

    @return: The escaped text.
    """
    try:
        # Available from Python 3.2; cgi.escape was removed in Python 3.8.
        from html import escape
    except ImportError:
        from cgi import escape
    # quote=False keeps both implementations to escaping just &, < and >.
    escaped = escape(s, quote=False)
    # HTML special/Unicode char encoding is in base 10.
    return unicode_special.sub(lambda m: '&#%d;' % ord(m.group()), escaped)

138

def html2unicode(text):
    """
    Sort of a cgi.unescape (doesn't exist). Removes HTML or XML character
    references and entities from a text string. References that cannot be
    decoded (unknown entity name, malformed or out-of-range number) are left
    as they are.

    http://effbot.org/zone/re-sub.htm#unescape-html

    @param text: The text containing references and entities.

    @return: L{text} with every decodable reference replaced by its character.
    """
    try:
        from htmlentitydefs import name2codepoint   # Python 2
    except ImportError:
        from html.entities import name2codepoint    # Python 3
    try:
        to_char = unichr                            # Python 2
    except NameError:
        to_char = chr                               # Python 3
    def fixup(m):
        ref = m.group(0)
        try:
            if ref[:3] == "&#x":
                # hexadecimal character reference
                return to_char(int(ref[3:-1], 16))
            if ref[:2] == "&#":
                # decimal character reference
                return to_char(int(ref[2:-1]))
            # named entity
            return to_char(name2codepoint[ref[1:-1]])
        except (ValueError, OverflowError, KeyError):
            # OverflowError covers numbers too large for the character
            # constructor, which would otherwise escape as a crash.
            return ref # leave as is
    return re.sub(r"&#?\w+;", fixup, text)

def nat_lang_join(xs, last_glue, two_glue = None, glue = ', '):
    """
    Natural-language join. Join a sequence of strings together into a
    comma-separated list, but where the last pair is joined with the given
    special glue. (You may also override the non-last glue, which defaults to
    a ', '.)

    @param xs: The sequence of strings. This must be a list-like sequence, not
    a generated one.

    @param last_glue: The string used to join the final pair of elements, when
    there are more than two elements.

    @param two_glue: The string used to join both elements in a 2-element
    sequence. Defaults to None, which means to use last_glue.

    @param glue: The string used to join all the other elements.

    @return: The joined string; '' for an empty sequence and the sole element
    for a 1-element sequence.
    """
    # Honor the documented default; concatenating None would raise TypeError.
    if two_glue is None:
        two_glue = last_glue
    count = len(xs)
    if count == 0:
        return ''
    if count == 1:
        return xs[0]
    if count == 2:
        return xs[0] + two_glue + xs[1]
    return glue.join(xs[:-1]) + last_glue + xs[-1]

189

def or_join(xs):
    "Joins L{xs} via L{nat_lang_join}, using ', or ' as last glue and ' or ' for a pair."
    return nat_lang_join(xs, ', or ', ' or ')

def and_join(xs):
    "Joins L{xs} via L{nat_lang_join}, using ', and ' as last glue and ' and ' for a pair."
    return nat_lang_join(xs, ', and ', ' and ')

192

class str_test( unittest.TestCase ):
    "Unit tests for the natural-language join helpers."

    def test_nat_lang_join( self ):
        "Checks a 3-element join with custom glue, and_join and or_join."
        self.assertEqual( nat_lang_join( 'alpha beta gamma'.split(), ' | ' ),
                          'alpha, beta | gamma' )
        self.assertEqual( and_join( 'alpha beta gamma'.split() ),
                          'alpha, beta, and gamma' )
        self.assertEqual( or_join( 'alpha beta'.split() ),
                          'alpha or beta' )

# Run the unit tests when this module is executed as a script.
if __name__ == '__main__':
    unittest.main()

Source Code for Module commons.strs