Python 将二进制文件读入结构体
声明:本页面是StackOverFlow热门问题的中英对照翻译,遵循CC BY-SA 4.0协议,如果您需要使用它,必须同样遵循CC BY-SA许可,注明原文地址和作者信息,同时你必须将它归于原作者(不是我):StackOverFlow
原文地址: http://stackoverflow.com/questions/14215715/
Warning: these are provided under the CC BY-SA 4.0 license. You are free to use/share them, but you must attribute them to the original authors (not me):
StackOverFlow
Reading a binary file into a struct
提问 by kasperhj
I have a binary file with a known format/structure.
我有一个已知格式/结构的二进制文件。
How do I read all the binary data in to an array of the structure?
如何将所有二进制数据读入结构数组?
Something like (in pseudo code)
类似的东西(在伪代码中)
bytes = read_file(filename)
struct = {'int','int','float','byte[255]'}
data = read_as_struct(bytes, struct)
data[1]
>>> 10,11,10.1,Arr[255]
EDIT:
编辑:
Solution so far:
到目前为止的解决方案:
import struct

data = []
fmt = '=iiiii256i'
fmt_s = '=iiiii'
fmt_spec = '256i'
struct_size = struct.calcsize(fmt)  # 5*4 + 256*4 = 1044; the 256i block alone is 1024 bytes

for i in range(struct_size, len(bytes) - struct_size, struct_size):
    dat1 = list(struct.unpack(fmt_s, bytes[i - struct_size:i - 1024]))
    dat2 = list(struct.unpack(fmt_spec, bytes[i - 1024:i]))
    dat1.append(dat2)
    data.append(dat1)
采纳答案 by Martijn Pieters
Use the struct module; you need to define the types in a string format documented with that library:
使用struct模块;您需要以该库记录的字符串格式定义类型:
struct.unpack('=HHf255s', bytes)
The above example expects native byte-order, two unsigned shorts, a float and a string of 255 characters.
上面的示例需要本机字节顺序、两个无符号短整型、一个浮点数和一个 255 个字符的字符串。
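As a side note (not part of the original answer), a format string matching the layout sketched in the question (two ints, a float and 255 raw bytes) would presumably be '=iif255s'; a tiny round-trip sketch with made-up values:

import struct

record_fmt = '=iif255s'                   # two ints, a float, 255 raw bytes
record_len = struct.calcsize(record_fmt)  # 4 + 4 + 4 + 255 = 267 with '=' (no padding)

packed = struct.pack(record_fmt, 10, 11, 10.1, b'\x00' * 255)
a, b, x, raw = struct.unpack(record_fmt, packed)
print(a, b, round(x, 1), len(raw))        # -> 10 11 10.1 255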
To loop over an already fully read bytes string, I'd use itertools; there is a handy grouper recipe that I've adapted here:
要遍历已经完全读取的 bytes 字符串,我会使用 itertools;这里有一个方便的 grouper 配方,我在此基础上做了改编:
from itertools import izip_longest, imap
from struct import unpack, calcsize

fmt_s = '=5i'
fmt_spec = '=256i'
size_s = calcsize(fmt_s)
size = size_s + calcsize(fmt_spec)

def chunked(iterable, n, fillvalue=''):
    args = [iter(iterable)] * n
    return imap(''.join, izip_longest(*args, fillvalue=fillvalue))

data = [unpack(fmt_s, section[:size_s]) + (unpack(fmt_spec, section[size_s:]),)
        for section in chunked(bytes, size)]
This produces tuples rather than lists, but it's easy enough to adjust if you have to:
这会生成元组而不是列表,但如果您必须这样做,则很容易进行调整:
data = [list(unpack(fmt_s, section[:size_s])) + [list(unpack(fmt_spec, section[size_s:]))]
        for section in chunked(bytes, size)]
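The snippets above are Python 2 (izip_longest and imap no longer exist under those names). A rough Python 3 equivalent of the same grouping idea, assuming the file contents are already in a bytes object named raw (a name not used in the answer), might look like this:

from itertools import zip_longest
from struct import calcsize, unpack

fmt_s = '=5i'
fmt_spec = '=256i'
size_s = calcsize(fmt_s)
size = size_s + calcsize(fmt_spec)

def chunked(data, n):
    # bytes iterate as ints in Python 3, so regroup every n of them into a bytes object
    groups = [iter(data)] * n
    return (bytes(g) for g in zip_longest(*groups, fillvalue=0))

records = [unpack(fmt_s, section[:size_s]) + (unpack(fmt_spec, section[size_s:]),)
           for section in chunked(raw, size)]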
回答 by ray_linn
import struct
First just read the binary into an array
首先将二进制读入数组
mbr = open('mbrcontent', 'rb').read()
So you can just fetch some piece of the array
所以你可以只获取数组的一部分
partition_table = mbr[446:510]
and then unpack it as an integer
然后将其解包为整数
signature = struct.unpack('<H', mbr[510:512])[0]
a more complex example
一个更复杂的例子
little_endian = (signature == 0xaa55) # should be True
print "Little endian:", little_endian
PART_FMT = (little_endian and '<' or '>') + (
    "B" # status (0x80 = bootable (active), 0x00 = non-bootable)
    # CHS of first block
    "B" # Head
    "B" # Sector is in bits 5-0; bits 9-8 of cylinder are in bits 7-6
    "B" # bits 7-0 of cylinder
    "B" # partition type
    # CHS of last block
    "B" # Head
    "B" # Sector is in bits 5-0; bits 9-8 of cylinder are in bits 7-6
    "B" # bits 7-0 of cylinder
    "L" # LBA of first sector in the partition
    "L" # number of blocks in partition, in little-endian format
)
PART_SIZE = 16
fmt_size = struct.calcsize(PART_FMT)
# sanity check expectations
assert fmt_size == PART_SIZE, "Partition format string is %i bytes, not %i" % (fmt_size, PART_SIZE)
def cyl_sector(sector_cyl, cylinder7_0):
    sector = sector_cyl & 0x3F  # bits 5-0
    # bits 7-6 of sector_cyl contain bits 9-8 of the cylinder
    cyl_high = (sector_cyl >> 6) & 0x03
    cyl = (cyl_high << 8) | cylinder7_0
    return sector, cyl
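# Not from the original answer: a quick sanity check of the bit packing above,
# using a made-up CHS byte pair.
# sector_cyl = 0b10111111 -> sector = 63, cylinder bits 9-8 = 0b10
assert cyl_sector(0b10111111, 0x34) == (63, (0b10 << 8) | 0x34)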
for partition in range(4):
    print "Partition #%i" % partition,
    offset = PART_SIZE * partition
    (status, start_head, start_sector_cyl, start_cyl7_0, part_type,
     end_head, end_sector_cyl, end_cyl7_0,
     lba, blocks) = struct.unpack(PART_FMT, partition_table[offset:offset + PART_SIZE])
    if status == 0x80:
        print "Bootable",
    elif status:
        print "Unknown status [%s]" % hex(status),
    print "Type=0x%x" % part_type
    start = (start_head,) + cyl_sector(start_sector_cyl, start_cyl7_0)
    end = (end_head,) + cyl_sector(end_sector_cyl, end_cyl7_0)
    print " (Start: Heads:%i\tCyl:%i\tSect:%i)" % start
    print " (End:   Heads:%i\tCyl:%i\tSect:%i)" % end
    print " LBA:", lba
    print " Blocks:", blocks
回答 by martineau
Actually it looks like you're trying to read a list (or array) of structures from the file. The idiomatic way to do this in Python is to use the struct module and call struct.unpack() in a loop, either a fixed number of times if you know the number of them in advance, or until end-of-file is reached, and store the results in a list. Here's an example of the latter:
实际上,您似乎是想从文件中读取一个结构列表(或数组)。在 Python 中,惯用的做法是使用 struct 模块并在循环中调用 struct.unpack():如果事先知道结构的数量就循环固定次数,否则一直读到文件末尾,并把结果保存在一个列表中。下面是后一种情况的例子:
import struct

struct_fmt = '=5if255s' # int[5], float, byte[255]
struct_len = struct.calcsize(struct_fmt)
struct_unpack = struct.Struct(struct_fmt).unpack_from

results = []
with open(filename, "rb") as f:
    while True:
        data = f.read(struct_len)
        if not data: break
        s = struct_unpack(data)
        results.append(s)
The same results can also be obtained slightly more concisely using a list comprehension along with a short generator function helper (i.e. read_chunks() below):
使用列表推导式以及一个简短的生成器辅助函数(即下面的 read_chunks()),也可以更简洁地得到同样的结果:
def read_chunks(f, length):
    while True:
        data = f.read(length)
        if not data: break
        yield data

with open(filename, "rb") as f:
    results = [struct_unpack(chunk) for chunk in read_chunks(f, struct_len)]
Update
更新
You don't, in fact, need to explicitly define a helper function as shown above because you can use Python's built-in iter() function to dynamically create the needed iterator object in the list comprehension itself, like so:
实际上,您不需要像上面那样显式定义一个辅助函数,因为您可以使用 Python 内置的 iter() 函数,在列表推导式本身中动态创建所需的迭代器对象,如下所示:
from functools import partial

with open(filename, "rb") as f:
    results = [struct_unpack(chunk) for chunk in iter(partial(f.read, struct_len), b'')]
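As a small follow-up (not in the original answer), the unpacked tuples can be given readable field names with collections.namedtuple; the field names below are purely hypothetical:

import struct
from collections import namedtuple
from functools import partial

Record = namedtuple('Record', 'a b c d e x payload')  # hypothetical names for '=5if255s'
struct_fmt = '=5if255s'
struct_len = struct.calcsize(struct_fmt)
struct_unpack = struct.Struct(struct_fmt).unpack_from

with open(filename, "rb") as f:
    results = [Record(*struct_unpack(chunk))
               for chunk in iter(partial(f.read, struct_len), b'')]
# results[0].x, results[0].payload, ...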
回答 by xielongen
import os, re
import functools
import ctypes
from ctypes import string_at, byref, sizeof, cast, POINTER, pointer, create_string_buffer, memmove
import numpy as np
import pandas as pd


# Map a ctypes.Structure layout onto a numpy structured dtype, so that a whole
# file of fixed-size records can be loaded with np.fromfile and viewed as a
# pandas DataFrame.
class _StructBase(ctypes.Structure):
    __type__ = 0
    _fields_ = []

    @classmethod
    def Offsetof(cls, field):
        # Accepts either a plain field name or an indexed one like 'field[3]'.
        pattern = '(?P<field>\w+)\[(?P<idx>\d+)\]'
        mat = re.match(pattern, field)
        if mat:
            fields = dict(cls.Fields())
            f = mat.groupdict()['field']
            idx = mat.groupdict()['idx']
            return cls.Offsetof(f) + int(idx) * ctypes.sizeof(fields[field])
        else:
            return getattr(cls, field).offset

    @classmethod
    def DType(cls):
        # Translate ctypes scalar types to numpy dtypes.
        map = {
            ctypes.c_byte: np.byte,
            ctypes.c_ubyte: np.ubyte,
            ctypes.c_char: np.ubyte,
            ctypes.c_int8: np.int8,
            ctypes.c_int16: np.int16,
            ctypes.c_int32: np.int32,
            ctypes.c_int64: np.int64,
            ctypes.c_uint8: np.uint8,
            ctypes.c_uint16: np.uint16,
            ctypes.c_uint32: np.uint32,
            ctypes.c_uint64: np.uint64,
            ctypes.c_float: np.float32,
            ctypes.c_double: np.float64,
        }
        res = []
        for k, v in cls.Fields():
            if hasattr(v, '_length_'):
                if v._type_ != ctypes.c_char:
                    for i in range(v._length_):
                        res.append((k, map[v], cls.Offsetof(k)))
                else:
                    res.append((k, 'S%d' % v._length_, cls.Offsetof(k)))
            else:
                res.append((k, map[v], cls.Offsetof(k)))
        res = pd.DataFrame(res, columns=['name', 'format', 'offset'])
        return np.dtype({
            'names': res['name'],
            'formats': res['format'],
            'offsets': res['offset'],
        })

    @classmethod
    def Attr(cls):
        # Expand array fields (except char arrays) into one entry per element.
        fields = cls._fields_
        res = []
        for attr, tp in fields:
            if str(tp).find('_Array_') > 0 and str(tp).find('char_Array_') < 0:
                for i in range(tp._length_):
                    res.append((attr + '[%s]' % str(i), tp._type_))
            else:
                res.append((attr, tp))
        return res

    @classmethod
    def Fields(cls, notype=False):
        # Collect fields from the whole inheritance chain, base classes first.
        res = [cls.Attr()]
        cur_cls = cls
        while True:
            cur_cls = cur_cls.__bases__[0]
            if cur_cls == ctypes.Structure:
                break
            res.append(cur_cls.Attr())
        if notype:
            return [k for k, v in functools.reduce(list.__add__, reversed(res), [])]
        else:
            return functools.reduce(list.__add__, reversed(res), [])

    @classmethod
    def size(cls):
        return sizeof(cls)

    @classmethod
    def from_struct_binary(cls, path, max_count=2 ** 32, decode=True):
        # Load every record in the file (up to max_count) into a structured array.
        print(os.path.getsize(path), cls.size())
        assert os.path.getsize(path) % cls.size() == 0
        size = os.path.getsize(path) // cls.size()
        size = min(size, max_count)
        index = range(size)
        array = np.fromfile(path, dtype=cls.DType(), count=size)
        df = pd.DataFrame(array, index=index)
        for attr, tp in eval(str(cls.DType())):
            if re.match('S\d+', tp) is not None and decode:
                try:
                    df[attr] = df[attr].map(lambda x: x.decode("utf-8"))
                except:
                    df[attr] = df[attr].map(lambda x: x.decode("gbk"))
        return df


class StructBase(_StructBase):
    _fields_ = [
        ('Type', ctypes.c_uint32),
    ]


class IndexStruct(StructBase):
    _fields_ = [
        ('Seq', ctypes.c_uint32),
        ('ExID', ctypes.c_char * 8),
        ('SecID', ctypes.c_char * 8),
        ('SecName', ctypes.c_char * 16),
        ('SourceID', ctypes.c_int32),
        ('Time', ctypes.c_uint32),
        ('PreClose', ctypes.c_uint32),
        ('Open', ctypes.c_uint32),
        ('High', ctypes.c_uint32),
        ('Low', ctypes.c_uint32),
        ('Match', ctypes.c_uint32),
    ]
df = IndexStruct.from_struct_binary('your path')
print(df)

