# Visit only the worksheets whose title consists solely of digits.
digitTitles = [title for title in inwb.get_sheet_names() if title.isdigit()]
for sheetName in digitTitles:
    sheet = inwb[sheetName]
得到工作表之后,接下来就是按行和列处理了。openpyxl会根据工作表里实际有数据的区域来确定行数和列数,行和列分别通过sheet.rows和sheet.columns这两个属性获取,它们都可以像list一样使用。比如,如果我想跳过数据少于2列的工作表,可以写:
# Loop-body fragment: skip this worksheet when it has fewer than two columns.
# NOTE(review): len() only works if sheet.columns is a sequence; confirm this
# for the openpyxl version in use (sheet.max_column may be an alternative).
if len(sheet.columns) < 2:
    continue
如果我想获取这个工作表的前两列,可以写(原文此处的示例代码缺失;下面的代码是后文用来拆分中英文姓名的正则表达式):
# Regex pattern matching one or more printable ASCII characters (space
# through tilde). Written as an explicit range so that characters which are
# special inside a character class (backslash, ']', '^', '-') are all covered.
asciiPattern = u'[\x20-\x7e]+'
# Regex pattern matching one or more common Chinese characters
# (U+4E00-U+9FFF), dots, spaces and the extra separator characters in seps.
# seps is escaped so that a separator which is special inside a character
# class (such as '-', ']', '^' or a backslash) cannot change the pattern.
chinesePattern = u'[\u4e00-\u9fff. %s]+' % re.escape(''.join(seps))
英文名用ASCII可打印字符的范围来匹配,常见中文字符的范围是\u4e00-\u9fff,seps则是前面提到过的、超出GBK范围的一些字符。除了简单的分割,我还需要处理只有中文名没有英文名、只有英文名没有中文名等情况,判断逻辑如下:
# Matches one common Chinese character. chinesePattern also accepts dots,
# spaces and separators, so this is used to confirm that a candidate Chinese
# part really contains Chinese text.
_CJK_CHAR = re.compile(u'[\u4e00-\u9fff]')


def split_name(name):
    """Split a combined name into its English and Chinese parts.

    Args:
        name: Text holding an English name, a Chinese name, or an English
            name followed by a space and a Chinese name. ``None`` and the
            empty string are accepted.

    Returns:
        A tuple ``(engName, chName)``. A part that is missing is ``None``.

    Usage:
        engName, chName = split_name(name)
    """
    if not name:
        return None, None

    # English name + space + Chinese name. The Chinese group must contain at
    # least one ideograph; otherwise a run of dots/spaces would qualify.
    both = re.match(u'(%s) (%s)' % (asciiPattern, chinesePattern), name)
    if both and _CJK_CHAR.search(both.group(2)):
        return both.group(1).strip(), both.group(2).strip()

    # Chinese name only: join every Chinese-looking run found in the text.
    chinese = u''.join(re.findall(chinesePattern, name)).strip()
    if _CJK_CHAR.search(chinese):
        return None, chinese

    # English name only (or nothing recognisable at all).
    english = u''.join(re.findall(asciiPattern, name)).strip()
    return (english or None), None
得到了中文名之后,我需要把它分割成姓和名。因为任务并不要求把姓名分割得非常精确,我就按照常见的中文姓名分割方式来分:两个字或三个字的名字,第一个字是姓;四个字的名字,前两个字是姓;带分隔符的名字(少数民族名字),分隔符前面的部分是姓(这里用到了前面的get_clean_ch_string函数来移除分隔符);名字更长又不带分隔符的,就把整个字符串作为姓(last name)返回,名(first name)为None。(注意英语的first name指的是名,last name指的是姓。)
def split_ch_name(chName):
    """Split a Chinese name into first (given) name and last (family) name.

    The rule is chosen by the length of ``chName``:

    * XY or XYZ: X is the last name, the rest is the first name.
    * WXYZ: WX is the last name, YZ is the first name.
    * Longer, with a separator (..ABC * XYZ...): the part before the first
      separator is the last name and the part after it is the first name;
      any further parts are ignored.
    * Longer, without a separator (...WXYZ): the whole name is returned as
      the last name and the first name is ``None``.

    Args:
        chName: The Chinese name. ``None`` and the empty string are accepted.

    Returns:
        A tuple ``(chFirstName, chLastName)``; ``(None, None)`` when there is
        nothing to split.

    Usage:
        chFirstName, chLastName = split_ch_name(chName)
    """
    if not chName:
        return None, None

    if len(chName) < 4:  # XY or XYZ
        return chName[1:], chName[0]

    if len(chName) == 4:  # WXYZ
        return chName[2:], chName[:2]

    # Longer names: get_clean_ch_string is defined elsewhere in this file and
    # presumably turns separator characters into whitespace, so the result
    # can be split into its parts.
    nameParts = get_clean_ch_string(chName).split()
    if not nameParts:  # nothing left after cleaning
        return None, None
    if len(nameParts) < 2:  # ...WXYZ
        return None, nameParts[0]

    chLastName, chFirstName = nameParts[:2]  # ..ABC * XYZ...
    return chFirstName, chLastName