练习:Protobuf 解析

在本练习中,您将为 protobuf 二进制编码构建一个解析器。别担心,其实非常简单!这展示了一种常见的解析模式,即传递数据 slice。底层数据本身永远不会被复制。

如要完整解析 protobuf 消息,需要知道字段的类型(按字段编号编入索引)。这通常会在 proto 文件中提供。在本练习中,我们将把这些信息编码成处理每个字段所调用的函数中的 match 语句。

我们将使用以下 proto:

  // A phone entry: a number string and a type string, both optional.
  1. message PhoneNumber {
  2. optional string number = 1;
  3. optional string type = 2;
  4. }
  // A person: an optional name, an optional int32 id, and zero or more
  // nested PhoneNumber messages (field number 3).
  5. message Person {
  6. optional string name = 1;
  7. optional int32 id = 2;
  8. repeated PhoneNumber phones = 3;
  9. }

proto 消息被编码为连续的一系列字段。每个字段都以“标签”后面紧跟值的形式编码。标签包含一个字段编号(例如,Person 消息的 id 字段的编号为 2)和线路类型(wire type,用于定义应如何从字节流确定载荷)。

整数(包括标签)使用名为 VARINT 的可变长度编码表示。幸运的是,下面为您提供了 parse_varint 的定义。给定的代码还定义了一些回调,用于处理 Person 和 PhoneNumber 字段,并将消息解析为对这些回调的一系列调用。

您需要完成的工作是:实现 parse_field 函数,并为 Person 和 PhoneNumber 实现 ProtoMessage trait。

  1. use std::convert::TryFrom;
  2. use thiserror::Error;
  3. #[derive(Debug, Error)]
  4. enum Error {
  5. #[error("Invalid varint")]
  6. InvalidVarint,
  7. #[error("Invalid wire-type")]
  8. InvalidWireType,
  9. #[error("Unexpected EOF")]
  10. UnexpectedEOF,
  11. #[error("Invalid length")]
  12. InvalidSize(#[from] std::num::TryFromIntError),
  13. #[error("Unexpected wire-type)")]
  14. UnexpectedWireType,
  15. #[error("Invalid string (not UTF-8)")]
  16. InvalidString,
  17. }
  /// A wire type as seen on the wire.
  ///
  /// Derives `Debug`, `Clone`, `Copy`, `PartialEq` and `Eq` so that wire types
  /// can be printed in diagnostics and compared directly in assertions.
  #[derive(Debug, Clone, Copy, PartialEq, Eq)]
  enum WireType {
      /// The Varint WireType indicates that the value is a single VARINT.
      Varint,
      // I64, -- not needed for this exercise
      /// The Len WireType indicates that the value is a length represented as a
      /// VARINT followed by exactly that number of bytes.
      Len,
      /// The I32 WireType indicates that the value is precisely 4 bytes in
      /// little-endian order containing a 32-bit signed integer.
      I32,
  }
  /// The payload of a single field, with one variant per supported wire type.
  ///
  /// The `Len` variant borrows its bytes from the input buffer (lifetime
  /// `'a`) rather than owning a copy.
  #[derive(Debug)]
  enum FieldValue<'a> {
      /// Value carried by a `Varint` field.
      Varint(u64),
      // I64(i64), -- not needed for this exercise
      /// Borrowed payload bytes of a `Len` field.
      Len(&'a [u8]),
      /// Value carried by an `I32` field.
      I32(i32),
  }
  38. #[derive(Debug)]
  39. /// A field, containing the field number and its value.
  40. struct Field<'a> {
  41. field_num: u64,
  42. value: FieldValue<'a>,
  43. }
  44. trait ProtoMessage<'a>: Default + 'a {
  45. fn add_field(&mut self, field: Field<'a>) -> Result<(), Error>;
  46. }
  47. impl TryFrom<u64> for WireType {
  48. type Error = Error;
  49. fn try_from(value: u64) -> Result<WireType, Error> {
  50. Ok(match value {
  51. 0 => WireType::Varint,
  52. //1 => WireType::I64, -- not needed for this exercise
  53. 2 => WireType::Len,
  54. 5 => WireType::I32,
  55. _ => return Err(Error::InvalidWireType),
  56. })
  57. }
  58. }
  59. impl<'a> FieldValue<'a> {
  60. fn as_string(&self) -> Result<&'a str, Error> {
  61. let FieldValue::Len(data) = self else {
  62. return Err(Error::UnexpectedWireType);
  63. };
  64. std::str::from_utf8(data).map_err(|_| Error::InvalidString)
  65. }
  66. fn as_bytes(&self) -> Result<&'a [u8], Error> {
  67. let FieldValue::Len(data) = self else {
  68. return Err(Error::UnexpectedWireType);
  69. };
  70. Ok(data)
  71. }
  72. fn as_u64(&self) -> Result<u64, Error> {
  73. let FieldValue::Varint(value) = self else {
  74. return Err(Error::UnexpectedWireType);
  75. };
  76. Ok(*value)
  77. }
  78. }
  79. /// Parse a VARINT, returning the parsed value and the remaining bytes.
  80. fn parse_varint(data: &[u8]) -> Result<(u64, &[u8]), Error> {
  81. for i in 0..7 {
  82. let Some(b) = data.get(i) else {
  83. return Err(Error::InvalidVarint);
  84. };
  85. if b & 0x80 == 0 {
  86. // This is the last byte of the VARINT, so convert it to
  87. // a u64 and return it.
  88. let mut value = 0u64;
  89. for b in data[..=i].iter().rev() {
  90. value = (value << 7) | (b & 0x7f) as u64;
  91. }
  92. return Ok((value, &data[i + 1..]));
  93. }
  94. }
  95. // More than 7 bytes is invalid.
  96. Err(Error::InvalidVarint)
  97. }
  98. /// Convert a tag into a field number and a WireType.
  99. fn unpack_tag(tag: u64) -> Result<(u64, WireType), Error> {
  100. let field_num = tag >> 3;
  101. let wire_type = WireType::try_from(tag & 0x7)?;
  102. Ok((field_num, wire_type))
  103. }
  104. /// Parse a field, returning the remaining bytes
  105. fn parse_field(data: &[u8]) -> Result<(Field, &[u8]), Error> {
  106. let (tag, remainder) = parse_varint(data)?;
  107. let (field_num, wire_type) = unpack_tag(tag)?;
  108. let (fieldvalue, remainder) = match wire_type {
  109. _ => todo!("Based on the wire type, build a Field, consuming as many bytes as necessary.")
  110. };
  111. todo!("Return the field, and any un-consumed bytes.")
  112. }
  113. /// Parse a message in the given data, calling `T::add_field` for each field in
  114. /// the message.
  115. ///
  116. /// The entire input is consumed.
  117. fn parse_message<'a, T: ProtoMessage<'a>>(mut data: &'a [u8]) -> Result<T, Error> {
  118. let mut result = T::default();
  119. while !data.is_empty() {
  120. let parsed = parse_field(data)?;
  121. result.add_field(parsed.0)?;
  122. data = parsed.1;
  123. }
  124. Ok(result)
  125. }
  /// A phone number entry whose strings borrow from the input buffer.
  ///
  /// `Default` yields empty strings for both members.
  #[derive(Debug, Default)]
  struct PhoneNumber<'a> {
      /// The number text.
      number: &'a str,
      /// The type text; trailing underscore because `type` is a Rust keyword.
      type_: &'a str,
  }
  131. #[derive(Debug, Default)]
  132. struct Person<'a> {
  133. name: &'a str,
  134. id: u64,
  135. phone: Vec<PhoneNumber<'a>>,
  136. }
  137. // TODO: Implement ProtoMessage for Person and PhoneNumber.
  138. fn main() {
  139. let person: Person = parse_message(&[
  140. 0x0a, 0x07, 0x6d, 0x61, 0x78, 0x77, 0x65, 0x6c, 0x6c, 0x10, 0x2a, 0x1a,
  141. 0x16, 0x0a, 0x0e, 0x2b, 0x31, 0x32, 0x30, 0x32, 0x2d, 0x35, 0x35, 0x35,
  142. 0x2d, 0x31, 0x32, 0x31, 0x32, 0x12, 0x04, 0x68, 0x6f, 0x6d, 0x65, 0x1a,
  143. 0x18, 0x0a, 0x0e, 0x2b, 0x31, 0x38, 0x30, 0x30, 0x2d, 0x38, 0x36, 0x37,
  144. 0x2d, 0x35, 0x33, 0x30, 0x38, 0x12, 0x06, 0x6d, 0x6f, 0x62, 0x69, 0x6c,
  145. 0x65,
  146. ])
  147. .unwrap();
  148. println!("{:#?}", person);
  149. }