liyao20060101 发表于 2018-8-7 09:25:16

Python3 处理 gb18030 乱码

# 修理 gb18030文件  
# 将乱码转化为十六进制字符串,例如:b'\xff' 转为字符串 0xFF
  
# 将不可打印单字节转为十六进制字符串,例如:b'\x7f' 转为字符串 0x7F
  
# srcFile 为原始 gb18030文件
  
# dstFile 为修理后的 gb18030文件
  
# explicit 控制是否转换不可打印字符: explicit 为 False 时不转换(默认),否则转换
  
def _hex_escape(byte):
    """Return the ``0xNN`` text (upper-case hex) for a single-byte ``bytes``."""
    return "0x%02X" % ord(byte)


def _try_decode_gb18030(seq):
    """Decode *seq* as GB18030, returning ``None`` if it is not decodable."""
    try:
        return seq.decode('gb18030')
    except UnicodeDecodeError:
        return None


def RepairGB18030File(srcFile, dstFile, explicit=False):
    """Repair a GB18030 file by replacing undecodable bytes with ``0xNN`` text.

    The source is scanned byte by byte. Well-formed one-, two- and four-byte
    GB18030 sequences are copied through unchanged. Any byte that cannot
    start a valid sequence is replaced by its hexadecimal spelling (for
    example ``b'\\xff'`` becomes the four characters ``0xFF``) and scanning
    resumes at the very next byte.

    Args:
        srcFile: Path of the original (possibly corrupted) GB18030 file.
        dstFile: Path the repaired GB18030 file is written to.
        explicit: When true, non-printable single bytes other than CR and LF
            are also replaced by their ``0xNN`` spelling; when false (the
            default) they are copied through unchanged.
    """
    with open(srcFile, mode='rb') as fin:
        byteText = fin.read()
    byteLength = len(byteText)
    print('byteLength: %d' % byteLength)

    pieces = []
    pos = 0
    while pos < byteLength:
        # Slices yield b'' past the end of the data, and b'' fails every
        # range test below, so no padding of the input is required.
        byte1 = byteText[pos:pos + 1]
        byte2 = byteText[pos + 1:pos + 2]
        byte3 = byteText[pos + 2:pos + 3]
        byte4 = byteText[pos + 3:pos + 4]

        # Single-byte range 0x00-0x7F.
        if byte1 <= b'\x7f':
            pos += 1
            char = byte1.decode('gb18030')
            keep = (
                char.isprintable()
                or byte1 in (b'\x0d', b'\x0a')  # line breaks are always kept
                or not explicit
            )
            pieces.append(char if keep else _hex_escape(byte1))
            continue

        # Lead byte of a two- or four-byte sequence.
        if b'\x81' <= byte1 <= b'\xfe':
            char = None
            width = 0
            if (b'\x40' <= byte2 <= b'\x7e') or (b'\x80' <= byte2 <= b'\xfe'):
                width = 2
                char = _try_decode_gb18030(byte1 + byte2)
            elif (b'\x30' <= byte2 <= b'\x39'
                    and b'\x81' <= byte3 <= b'\xfe'
                    and b'\x30' <= byte4 <= b'\x39'):
                # Both trailing bytes must be in range for a four-byte code.
                width = 4
                char = _try_decode_gb18030(byteText[pos:pos + 4])
            if char is not None:
                pieces.append(char)
                pos += width
                continue

        # Garbage: a stray 0x80/0xFF, a lead byte with a bad trail, or a
        # well-formed but unassigned code. Only one byte can be skipped
        # safely, because the following byte may start a valid sequence.
        pieces.append(_hex_escape(byte1))
        pos += 1

    repairedText = ''.join(pieces)
    # newline='' keeps the file's own line endings instead of letting the
    # text layer rewrite '\n' (which would turn '\r\n' into '\r\r\n' on
    # Windows).
    with open(dstFile, mode='w', encoding='gb18030', newline='') as fout:
        fout.write(repairedText)
页: [1]
查看完整版本: Python3 处理 gb18030 乱码