Iscsi

iPXE 引導期間的 iSCSI 讀取錯誤

  • July 28, 2021

編輯:大量更新 2019-02-16 以包含其他故障排除資訊。

多年來,我一直在使用 iPXE 和 iSCSI 環境,但這是我第一次嘗試進行 iSCSI 引導,而 iPXE 與 iSCSI 目標的通訊出現了問題。

儲存伺服器

CentOS Linux release 7.6.1810 (Core)
Linux san1srvp01.********.net 3.10.0-957.5.1.el7.x86_64 #1 SMP Fri Feb 1 14:54:57 UTC 2019 x86_64 x86_64 x86_64 GNU/Linux
zfs-0.7.12-1.el7_6.x86_64

後端 block 儲存範例

Disk /dev/zpool1/jane: 8422 MB, 8422687232 bytes, 16450561 sectors
Units = sectors of 1 * 512 = 512 bytes
Sector size (logical/physical): 512 bytes / 512 bytes
I/O size (minimum/optimal): 512 bytes / 512 bytes
Disk label type: dos
Disk identifier: 0xa9a554b4

          Device Boot      Start         End      Blocks   Id  System
/dev/zpool1/jane1              63       80324       40131   12  Compaq diagnostics
/dev/zpool1/jane2   *       80325    16434494     8177085    7  HPFS/NTFS/exFAT

後端 fileio 儲存範例

Disk /zpool1/nas1/Media/c0d0.img: 8422 MB, 8422686720 bytes, 16450560 sectors
Units = sectors of 1 * 512 = 512 bytes
Sector size (logical/physical): 512 bytes / 512 bytes
I/O size (minimum/optimal): 512 bytes / 512 bytes
Disk label type: dos
Disk identifier: 0xa9a554b4

                     Device Boot      Start         End      Blocks   Id  System
/zpool1/nas1/Media/c0d0.img1              63       80324       40131   12  Compaq diagnostics
/zpool1/nas1/Media/c0d0.img2   *       80325    16434494     8177085    7  HPFS/NTFS/exFAT

簡化的 iSCSI 目標配置。這是使用 ZFS zvol 的塊範例,我也嘗試了 fileio,它的行為沒有什麼不同。

{
 "fabric_modules": [
   {
     "discovery_enable_auth": true,
     "discovery_password": "********************************",
     "discovery_userid": "san1srvp01",
     "name": "iscsi"
   }
 ],
 "storage_objects": [
   {
     "alua_tpgs": [
       {
         "alua_access_state": 0,
         "alua_access_status": 0,
         "alua_access_type": 3,
         "alua_support_active_nonoptimized": 1,
         "alua_support_active_optimized": 1,
         "alua_support_offline": 1,
         "alua_support_standby": 1,
         "alua_support_transitioning": 1,
         "alua_support_unavailable": 1,
         "alua_write_metadata": 0,
         "implicit_trans_secs": 0,
         "name": "default_tg_pt_gp",
         "nonop_delay_msecs": 100,
         "preferred": 0,
         "tg_pt_gp_id": 0,
         "trans_delay_msecs": 0
       }
     ],
     "attributes": {
       "block_size": 512,
       "emulate_3pc": 1,
       "emulate_caw": 1,
       "emulate_dpo": 1,
       "emulate_fua_read": 1,
       "emulate_fua_write": 1,
       "emulate_model_alias": 1,
       "emulate_rest_reord": 0,
       "emulate_tas": 1,
       "emulate_tpu": 0,
       "emulate_tpws": 0,
       "emulate_ua_intlck_ctrl": 0,
       "emulate_write_cache": 0,
       "enforce_pr_isids": 1,
       "force_pr_aptpl": 0,
       "is_nonrot": 1,
       "max_unmap_block_desc_count": 1,
       "max_unmap_lba_count": 262144,
       "max_write_same_len": 65535,
       "optimal_sectors": 32768,
       "pi_prot_format": 0,
       "pi_prot_type": 0,
       "queue_depth": 128,
       "unmap_granularity": 16,
       "unmap_granularity_alignment": 0,
       "unmap_zeroes_data": 0
     },
     "dev": "/dev/zpool1/jane",
     "name": "jane",
     "plugin": "block",
     "readonly": false,
     "write_back": false,
     "wwn": "8688850f-7200-48a0-ad32-0f4f9397a836"
   }
 ],
 "targets": [
   {
     "fabric": "iscsi",
     "tpgs": [
       {
         "attributes": {
           "authentication": 0,
           "cache_dynamic_acls": 0,
           "default_cmdsn_depth": 64,
           "default_erl": 0,
           "demo_mode_discovery": 1,
           "demo_mode_write_protect": 1,
           "fabric_prot_type": 0,
           "generate_node_acls": 0,
           "login_timeout": 15,
           "netif_timeout": 2,
           "prod_mode_write_protect": 0,
           "t10_pi": 0,
           "tpg_enabled_sendtargets": 1
         },
         "enable": true,
         "luns": [
           {
             "alias": "414d07d6b4",
             "alua_tg_pt_gp_name": "default_tg_pt_gp",
             "index": 2,
             "storage_object": "/backstores/block/jane"
           }
         ],
         "node_acls": [
           {
             "attributes": {
               "dataout_timeout": 3,
               "dataout_timeout_retries": 5,
               "default_erl": 0,
               "nopin_response_timeout": 30,
               "nopin_timeout": 15,
               "random_datain_pdu_offsets": 0,
               "random_datain_seq_offsets": 0,
               "random_r2t_offsets": 0
             },
             "chap_mutual_password": "****************",
             "chap_mutual_userid": "san1srvp01",
             "chap_password": "****************",
             "chap_userid": "jane",
             "mapped_luns": [
               {
                 "alias": "c8ce872be3",
                 "index": 2,
                 "tpg_lun": 2,
                 "write_protect": false
               }
             ],
             "node_wwn": "iqn.1999-10.net.********:jane"
           }
         ],
         "parameters": {
           "AuthMethod": "CHAP,None",
           "DataDigest": "CRC32C,None",
           "DataPDUInOrder": "Yes",
           "DataSequenceInOrder": "Yes",
           "DefaultTime2Retain": "20",
           "DefaultTime2Wait": "2",
           "ErrorRecoveryLevel": "0",
           "FirstBurstLength": "65536",
           "HeaderDigest": "CRC32C,None",
           "IFMarkInt": "2048~65535",
           "IFMarker": "No",
           "ImmediateData": "Yes",
           "InitialR2T": "Yes",
           "MaxBurstLength": "262144",
           "MaxConnections": "1",
           "MaxOutstandingR2T": "1",
           "MaxRecvDataSegmentLength": "8192",
           "MaxXmitDataSegmentLength": "262144",
           "OFMarkInt": "2048~65535",
           "OFMarker": "No",
           "TargetAlias": "LIO Target"
         },
         "portals": [
           {
             "ip_address": "192.168.40.1",
             "iser": false,
             "offload": false,
             "port": 3260
           }
         ],
         "tag": 1
       }
     ],
     "wwn": "iqn.1999-10.net.********:san1srvp01"
   }
 ]
}

PXE/iPXE/TFTP/HTTP 伺服器

CentOS release 6.10 (Final)
Linux sy1srvp01.********.net 2.6.32-754.10.1.el6.i686 #1 SMP Tue Jan 15 17:33:10 UTC 2019 i686 i686 i386 GNU/Linux
tftp-0.49-8.el6.i686

iPXE 實現會首先依序轉交給與主機名、UUID 或 MAC 位址匹配的腳本。這是此 MAC 位址專用的 iPXE 啟動腳本 mac-0007e90feaf5.ipxe

set username jane
set password ****************
set reverse-username san1srvp01
set reverse-password ****************
set initiator-iqn iqn.1999-10.net.********:jane
sanboot iscsi:192.168.40.1::::iqn.1999-10.net.********:san1srvp01

啟動器(Initiator)

Compaq ML370 (Generation 0)
BIOS P17 (12/18/2002)
Processor 866/133 Mhz with 256k Cache
RAM 1 GB
Intel Boot Agent GE v1.2.22

PXE -> iPXE 鏈式載入

PXE 2.1 Build 084 (WfM 2.0), RPL V1.25

PX->EB: PXE! at 9CC2:0070, entry point at 9CC2:0106
           UNDI code segment 9CC2:0000, data segment 969B:0000 (602-628kB)
           UNDI device is PCI 00:06.0, type DIX+802.3
           602kB free base memory after PXE unload

iPXE 1.0.0+ -- Open Source Network Boot Firmware -- http://ipxe.org
Features: DNS HTTP iSCSI TFTP AoE ELF MBOOT PXE bzImage Menu PXEXT

我使用網路跟踪來追蹤 iPXE sanboot 的 iSCSI 引導過程。從高層次來看是:

  1. 登錄命令 (CHAP)
  2. 測試單元就緒
  3. 讀取容量(10)
  4. 讀取 (10) <- 失敗

首先,Read Capacity(10) 返回的 LBA 是一個意外的值 63,而不是 16450560。然後它嘗試在 LBA 64 處進行 Read(10),這可以預見地失敗並顯示 Logical Block Address Out Of Range。測試表明這是由 iPXE 和 LIO 之間的特定互動引起的,但確切原因尚不清楚。

讀取容量(10) - 請求

Frame 27: 114 bytes on wire (912 bits), 114 bytes captured (912 bits)
Ethernet II, Src: Intel_0f:ea:f5 (00:07:e9:0f:ea:f5), Dst: SuperMic_6c:a9:82 (00:25:90:6c:a9:82)
Internet Protocol Version 4, Src: 192.168.4.13, Dst: 192.168.40.1
Transmission Control Protocol, Src Port: cifs (3020), Dst Port: iscsi-target (3260), Seq: 773, Ack: 637, Len: 48
iSCSI (SCSI Command)
Flags: 0xc1, F, R, Attr: Simple
SCSI CDB Read Capacity(10)
   [LUN: 0x0000]
   [Command Set:Direct Access Device (0x00) (Using default commandset)]
   [Response in: 29]
   Opcode: Read Capacity(10) (0x25)
   Control: 0x00

讀取容量(10) - 響應

Frame 29: 74 bytes on wire (592 bits), 74 bytes captured (592 bits)
Ethernet II, Src: SuperMic_6c:a9:82 (00:25:90:6c:a9:82), Dst: Intel_0f:ea:f5 (00:07:e9:0f:ea:f5)
Internet Protocol Version 4, Src: 192.168.40.1, Dst: 192.168.4.13
Transmission Control Protocol, Src Port: iscsi-target (3260), Dst Port: cifs (3020), Seq: 685, Ack: 821, Len: 8
[2 Reassembled TCP Segments (56 bytes): #28(48), #29(8)]
iSCSI (SCSI Data In)
SCSI Payload (Read Capacity(10) Response Data)
   [LUN: 0x0000]
   [Command Set:Direct Access Device (0x00) (Using default commandset)]
   [SBC Opcode: Read Capacity(10) (0x25)]
   [Request in: 27]
   [Response in: 29]
   LBA: 63 (0 MB)
   Block size in bytes: 512
SCSI Response (Read Capacity(10))
   [LUN: 0x0000]
   [Command Set:Direct Access Device (0x00) (Using default commandset)]
   [SBC Opcode: Read Capacity(10) (0x25)]
   [Request in: 27]
   [Time from request: 0.000252000 seconds]
   [Status: Good (0x00)]

讀取(10) - 請求

Frame 32: 114 bytes on wire (912 bits), 114 bytes captured (912 bits)
Ethernet II, Src: Intel_0f:ea:f5 (00:07:e9:0f:ea:f5), Dst: SuperMic_6c:a9:82 (00:25:90:6c:a9:82)
Internet Protocol Version 4, Src: 192.168.4.13, Dst: 192.168.40.1
Transmission Control Protocol, Src Port: cifs (3020), Dst Port: iscsi-target (3260), Seq: 821, Ack: 693, Len: 48
iSCSI (SCSI Command)
Flags: 0xc1, F, R, Attr: Simple
SCSI CDB Read(10)
   [LUN: 0x0000]
   [Command Set:Direct Access Device (0x00) (Using default commandset)]
   [Response in: 33]
   Opcode: Read(10) (0x28)
   Flags: 0x00
   Logical Block Address (LBA): 64
   ...0 0000 = Group: 0x00
   Transfer Length: 4
   Control: 0x00

讀取 (10) - 響應

Frame 33: 214 bytes on wire (1712 bits), 214 bytes captured (1712 bits)
Ethernet II, Src: SuperMic_6c:a9:82 (00:25:90:6c:a9:82), Dst: Intel_0f:ea:f5 (00:07:e9:0f:ea:f5)
Internet Protocol Version 4, Src: 192.168.40.1, Dst: 192.168.4.13
Transmission Control Protocol, Src Port: iscsi-target (3260), Dst Port: cifs (3020), Seq: 693, Ack: 869, Len: 148
iSCSI (SCSI Response)
Flags: 0x80
SCSI: SNS Info
   [LUN: 0x0000]
   .111 0000 = SNS Error Type: Current Error (0x70)
   Valid: 112
   0... .... = Filemark: False
   .0.. .... = EOM: False
   ..0. .... = ILI: False
   .... 0101 = Sense Key: Illegal Request (0x5)
   Sense Info: 0x00000000
   Additional Sense Length: 10
   Command-Specific Information: 00000000
   Additional Sense Code+Qualifier: Logical Block Address Out Of Range (0x2100)
   Field Replaceable Unit Code: 0x00
   0... .... = SKSV: False
   .000 0000 0000 0000 0000 0000 = Sense Key Specific: 0x000000

iPXE 對控制台的響應

Could not open SAN device: Input/output error (http://ipxe.org/1d704039)
Could not boot image: Input/output error (http://ipxe.org/1d704039)

iSCSI 目標上記錄的消息

Feb 13 09:17:41 san1srvp01 kernel: cmd exceeds last lba 64 (lba 64, sectors 4)

測試和故障排除

  • 嘗試使用此 iSCSI LUN 引導 VM 時觀察到相同的行為,排除了物理機及其網卡。
  • 使用本機 Linux 啟動器掛載時,iSCSI 設備行為正確,並且該設備最初是使用 dd 將映像文件複製到 iSCSI 掛載上填充的。
  • 我創建了一個 iPXE 的自定義建構,它強制使用 Read Capacity(16) 和 Read(16),但無濟於事。
  • 在這種情況下,我發現了一個記錄在案的類似行為實例,該實例被確定是由Login階段期間提供(或不提供)的操作參數引起的。作為回應,我創建了一個自定義建構,其參數和參數值與使用本機 Linux 啟動器在工作掛載期間觀察到的參數和參數值相同,但無濟於事。
  • 我嘗試將 fileio 備份儲存映像從 ZFS 移動到 xfs 卷,但無濟於事。
  • 我已經嘗試使用零初始化塊設備的 iPXE sanboot,但無濟於事。這表明該問題與分區或塊設備內容無關。

問題

  • 有人能證實自己有與此等效且可正常運作的設置嗎?
  • 有誰知道這個設置是否存在明顯的問題?
  • 有誰知道什麼會導致 LIO 以這種方式行事?
  • 最難的問題是,有人知道如何解決嗎?

-TIA

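該卷是儲存伺服器上的 LUN 2,因此 iPXE 需要在 SAN URI 中明確定址 LUN 2(未指定時使用 LUN 0,如上方跟踪中的 LUN: 0x0000 所示),例如:sanboot iscsi:192.168.40.1:::2:iqn.1999-10.net.********:san1srvp01。儲存伺服器甚至沒有 LUN 0,所以不清楚為什麼 Test Unit Ready 會成功。問題原來這麼簡單,有點尷尬。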

引用自:https://serverfault.com/questions/953966