1
2
3
"""
String formatting, encoding, etc.
"""
7
# Names exported by ``from <module> import *``, one per line.
# NOTE(review): `format` is listed here but no definition of it is visible in
# the reviewed text -- confirm it exists before relying on the star-import.
__all__ = '''
and_join
cp1252_to_unicode
cp1252_to_unicode_translations
dos2unix
format
html2unicode
indent
nat_lang_join
or_join
remove_empty_lines
safe_ascii
underline
unicode2html
unicode_special
unwrap
'''.split()
25
26 import itertools, cgi, re, unittest
27
31
# NOTE(review): the `def` line was missing from the reviewed text; the
# signature is reconstructed from `__all__` and the body -- please confirm.
def safe_ascii(s):
    """
    Casts a Unicode string to a regular ASCII byte string when that can be
    done without losing anything.

    On Python 2, a C{unicode} value consisting only of ASCII characters is
    returned as the equivalent C{str}.  Every other value (Unicode text that
    contains non-ASCII characters, byte strings, non-strings) is returned
    unchanged.  On Python 3 there is no separate C{unicode} type, so L{s} is
    always returned unchanged.

    @param s: The value to convert.

    @return: An ASCII C{str} (Python 2 only), or L{s} itself.
    """
    try:
        text_type = unicode
    except NameError:
        # Python 3: `str` is already the only text type; nothing to cast.
        return s
    if not isinstance(s, text_type):
        return s
    try:
        return s.encode('ascii')
    except UnicodeEncodeError:
        # Non-ASCII text cannot be cast losslessly; hand it back untouched
        # instead of letting the encode error escape.
        return s
39
# Pairs of (character as mis-decoded from a Windows-1252 byte in the range
# 0x80-0x9F, intended Unicode character).  The bytes 0x81, 0x8D, 0x8F, 0x90
# and 0x9D have no entry in this table.
cp1252_to_unicode_translations = [
    (u'\x80', u'\u20AC'),
    (u'\x82', u'\u201A'),
    (u'\x83', u'\u0192'),
    (u'\x84', u'\u201E'),
    (u'\x85', u'\u2026'),
    (u'\x86', u'\u2020'),
    (u'\x87', u'\u2021'),
    (u'\x88', u'\u02C6'),
    (u'\x89', u'\u2030'),
    (u'\x8A', u'\u0160'),
    (u'\x8B', u'\u2039'),
    (u'\x8C', u'\u0152'),
    (u'\x8E', u'\u017D'),
    (u'\x91', u'\u2018'),
    (u'\x92', u'\u2019'),
    (u'\x93', u'\u201C'),
    (u'\x94', u'\u201D'),
    (u'\x95', u'\u2022'),
    (u'\x96', u'\u2013'),
    (u'\x97', u'\u2014'),
    (u'\x98', u'\u02DC'),
    (u'\x99', u'\u2122'),
    (u'\x9A', u'\u0161'),
    (u'\x9B', u'\u203A'),
    (u'\x9C', u'\u0153'),
    (u'\x9E', u'\u017E'),
    (u'\x9F', u'\u0178'),
]


# NOTE(review): the `def` line was missing from the reviewed text; the
# signature is reconstructed from `__all__` and the body -- please confirm.
def cp1252_to_unicode(x):
    """
    Converts characters 0x80 through 0x9f to their proper Unicode
    equivalents.  See
    U{http://www.intertwingly.net/stories/2004/04/14/i18n.html} for the nice
    translation table on which this is based.

    @param x: The text to fix up.

    @return: L{x} with every character listed in
        L{cp1252_to_unicode_translations} replaced by its counterpart.
    """
    # The table is read on every call, so edits made to the public list by
    # other code are honoured.
    for a, b in cp1252_to_unicode_translations:
        x = x.replace(a, b)
    return x
76
# NOTE(review): the `def` line was missing from the reviewed text; the
# signature is reconstructed from `__all__` and the body -- please confirm.
def unwrap(s):
    """
    Joins a bunch of lines into one, separated by single spaces.  L{s} is
    either a single string (which will be stripped and split on newlines into
    a list of strings) or an iterable of strings (representing lines).  Each
    line is stripped of surrounding whitespace before joining.

    @param s: A string or an iterable of line strings.

    @return: The lines joined into a single string.
    """
    # `type(u'')` is `unicode` on Python 2 and `str` on Python 3, so Unicode
    # text is split into lines rather than iterated character by character.
    if isinstance(s, (str, type(u''))):
        s = s.strip().split('\n')
    return ' '.join(line.strip() for line in s)
85
# NOTE(review): the `def` line was missing from the reviewed text; the
# signature is reconstructed from `__all__`, the docstring and the body.  Any
# default value `ind` may have had could not be recovered -- please confirm.
def indent(s, ind):
    """
    Prefixes each line in L{s} with L{ind}.  L{s} can be either a string (which
    will be broken up into a list of lines) or an iterable of strings (treated
    as lines).  Returns a single (indented) string.

    @param s: A string or an iterable of line strings.

    @param ind: The prefix put in front of every line, empty lines included.

    @return: The prefixed lines joined with newlines.
    """
    # `type(u'')` is `unicode` on Python 2 and `str` on Python 3, so Unicode
    # text is split into lines rather than iterated character by character.
    if isinstance(s, (str, type(u''))):
        s = s.split('\n')
    return '\n'.join(ind + line for line in s)
94
# NOTE(review): the `def` line was missing from the reviewed text and this
# function is not listed in `__all__`, so the name `dedent` and the `amt=None`
# default are reconstructions from the docstring -- please confirm both.
def dedent(text, amt=None):
    """
    If L{amt} is 0, removes all leading whitespace from each line in L{text}.
    If L{amt} is L{None}, finds the smallest amount of leading whitespace on
    any non-empty line and removes that many chars from each line.  If L{amt}
    is positive, removes L{amt} chars from each line (whether or not those
    chars are whitespace).

    @param text: The text to process; lines are separated by newlines.

    @param amt: 0, L{None} or a positive character count, as described above.

    @return: The processed lines joined with newlines.
    """
    lines = text.split('\n')
    if amt == 0:
        return '\n'.join(line.lstrip() for line in lines)
    if amt is None:
        if text.strip() == '':
            # Nothing but whitespace: cut every line down to nothing.
            amt = len(text)
        else:
            # Width of the leading whitespace run, measured only on lines
            # that contain something other than whitespace.
            amt = min(len(line) - len(line.lstrip())
                      for line in lines if line.strip() != '')
    return '\n'.join(line[amt:] for line in lines)
112
# NOTE(review): the `def` line was missing from the reviewed text; the
# signature is reconstructed from `__all__` and the body -- please confirm.
def remove_empty_lines(s):
    """
    Removes all empty lines (or lines of just whitespace).

    @param s: The text to filter; lines are separated by newlines.

    @return: The remaining lines joined with newlines.
    """
    return '\n'.join(line for line in s.split('\n') if line.strip() != '')
116
# NOTE(review): the `def` line was missing from the reviewed text; the
# signature is reconstructed from `__all__`, the docstring and the body.  Any
# default value `sep` may have had could not be recovered -- please confirm.
def underline(s, sep):
    """
    Appends to L{s} a newline and a number of repetitions of L{sep}; the number
    of repetitions is the length of L{s}.

    @param s: The text to underline.

    @param sep: The string repeated to form the underline.

    @return: L{s}, a newline, and the underline, as one string.
    """
    return s + '\n' + (sep * len(s))
123
# NOTE(review): the `def` line was missing from the reviewed text; the
# signature is reconstructed from `__all__` and the body -- please confirm.
def dos2unix(s):
    """
    Removes carriage returns.

    Every C{'\\r'} is dropped, including ones not followed by a newline.

    @param s: The text to convert.

    @return: L{s} without any carriage-return characters.
    """
    return s.replace('\r', '')
127
# NOTE(review): the `def` line was missing from the reviewed text and this
# function is not listed in `__all__`, so the name `js_unicode_escape` is a
# reconstruction -- please confirm it against the callers.
def js_unicode_escape(s):
    """
    Escapes a string for use inside a JavaScript string literal: every
    character is written as a C{\\uXXXX} escape.  The surrounding quotes are
    not added.

    @param s: The text to escape.

    @return: The concatenated C{\\uXXXX} escapes.
    """
    code_units = []
    for c in s:
        code_point = ord(c)
        if code_point > 0xFFFF:
            # A \u escape holds exactly four hex digits, so characters beyond
            # U+FFFF are written as a UTF-16 surrogate pair.
            code_point -= 0x10000
            code_units.append(0xD800 + (code_point >> 10))
            code_units.append(0xDC00 + (code_point & 0x3FF))
        else:
            code_units.append(code_point)
    return ''.join(r'\u%04x' % unit for unit in code_units)
131
try:
    # Python 3.2+; cgi.escape() was removed in Python 3.8.
    from html import escape as _escape_markup
except ImportError:
    # Python 2.
    from cgi import escape as _escape_markup

# Matches any single non-ASCII character.  Written as a negated ASCII range so
# that characters above U+FFFF are matched too.
unicode_special = re.compile(u'[^\x00-\x7f]')


# NOTE(review): the `def` line was missing from the reviewed text; the
# signature is reconstructed from `__all__` and the body -- please confirm.
def unicode2html(s):
    """
    Extends cgi.escape() with escapes for all unicode characters.

    C{&}, C{<} and C{>} are replaced by their HTML entities (quotes are left
    alone) and every non-ASCII character is replaced by a decimal numeric
    character reference such as C{&#233;}.

    @param s: The text to escape.

    @return: The escaped text.
    """
    # quote=False mirrors the default of cgi.escape(), which the original
    # code relied on; html.escape() would otherwise escape quotes as well.
    escaped = _escape_markup(s, quote=False)
    return unicode_special.sub(lambda m: '&#%d;' % ord(m.group()), escaped)
138
try:
    from htmlentitydefs import name2codepoint as _name2codepoint  # Python 2
except ImportError:
    from html.entities import name2codepoint as _name2codepoint  # Python 3

try:
    _unichr = unichr  # Python 2
except NameError:
    _unichr = chr  # Python 3

# A numeric character reference (&#233; / &#xe9;) or a named entity (&eacute;).
_char_reference = re.compile(r'&#?\w+;')


# NOTE(review): the `def` line was missing from the reviewed text; the
# signature is reconstructed from `__all__` and the body -- please confirm.
def html2unicode(text):
    """
    Sort of a cgi.unescape (doesn't exist).  Removes HTML or XML character
    references and entities from a text string.

    Decimal (C{&#233;}) and hexadecimal (C{&#xe9;}, C{&#XE9;}) references and
    named entities (C{&eacute;}) are replaced by the character they stand
    for.  Anything that cannot be resolved -- an unknown entity name, a
    malformed number, a code point that cannot be turned into a character --
    is left in the text exactly as written.

    http://effbot.org/zone/re-sub.htm#unescape-html

    @param text: The text containing references and entities.

    @return: L{text} with every resolvable reference replaced.
    """
    def fixup(m):
        ref = m.group(0)
        try:
            if ref[:3].lower() == '&#x':
                # Hexadecimal character reference.
                return _unichr(int(ref[3:-1], 16))
            if ref[:2] == '&#':
                # Decimal character reference.
                return _unichr(int(ref[2:-1]))
            # Named entity.
            return _unichr(_name2codepoint[ref[1:-1]])
        except (ValueError, OverflowError, KeyError):
            # ValueError: malformed or out-of-range number; OverflowError:
            # number too large to convert; KeyError: unknown entity name.
            return ref
    return _char_reference.sub(fixup, text)
166
# NOTE(review): the `def` line was missing from the reviewed text; the
# signature is reconstructed from the docstring (documented defaults) and the
# positional call in the module's unit test -- please confirm.
def nat_lang_join(xs, last_glue, two_glue=None, glue=', '):
    """
    Natural-language join.  Join a sequence of strings together into a
    comma-separated list, but where the last pair is joined with the given
    special glue.  (You may also override the non-last glue, which defaults to
    a ', '.)

    @param xs: The strings to join.  Any iterable is accepted; it is copied
        into a list first.

    @param last_glue: The string used to join the final pair of elements, when
        there are more than two elements.

    @param two_glue: The string used to join both elements in a 2-element
        sequence.  Defaults to None, which means to use last_glue.

    @param glue: The string used to join all the other elements.

    @return: The empty string for no elements, the element itself for one
        element, otherwise the joined string.
    """
    xs = list(xs)
    if two_glue is None:
        # Documented default: fall back to last_glue.  Without this, joining
        # exactly two items with the default raised TypeError.
        two_glue = last_glue
    if not xs:
        return ''
    if len(xs) == 1:
        return xs[0]
    if len(xs) == 2:
        return xs[0] + two_glue + xs[1]
    return glue.join(xs[:-1]) + last_glue + xs[-1]
189
192
# NOTE(review): the `class` and `def` lines were missing from the reviewed
# text; the class and method names below are reconstructions -- please confirm.
class NatLangJoinTest(unittest.TestCase):
    """Unit tests for nat_lang_join, and_join and or_join."""

    def test_nat_lang_join(self):
        # Three items with a custom glue for the last pair.
        self.assertEqual( nat_lang_join( 'alpha beta gamma'.split(), ' | ' ),
                          'alpha, beta | gamma' )
        # Three items joined with "and".
        self.assertEqual( and_join( 'alpha beta gamma'.split() ),
                          'alpha, beta, and gamma' )
        # Exactly two items joined with "or".
        self.assertEqual( or_join( 'alpha beta'.split() ),
                          'alpha or beta' )
201
if __name__ == '__main__':
    # Run this module's unit tests when it is executed as a script.
    unittest.main()
204