User Tools

Site Tools


acelab:install_and_configure

* Verify that the server has a Mellanox network adapter (HCA/NIC) installed

 [root@bright1 OFED]# lspci -v|grep Mellanox
 02:00.0 Network controller: Mellanox Technologies MT27500 Family [ConnectX-3]
 Subsystem: Mellanox Technologies Device 0067
 [root@bright1 OFED]#

NB: For all management servers, this is true.

* Since we are using the M3601Q 32-port 40G IB switch on the M1000e, verify which adapters the compute nodes on the M1000e have

 [root@bright1 OFED]# pdsh -w cnode01 "lspci -v|grep Mellanox"
 cnode01: 04:00.0 InfiniBand: Mellanox Technologies MT26428 [ConnectX VPI PCIe 2.0 5GT/s - IB QDR / 10GigE] (rev b0)
 cnode01:  Subsystem: Mellanox Technologies MT26428 [ConnectX VPI PCIe 2.0 5GT/s - IB QDR / 10GigE]
 [root@bright1 OFED]#

NB: For all compute nodes, this is true

* Bright1 server is running opensm

 [root@bright1 OFED]# systemctl status opensm
 ● opensm.service - Starts the OpenSM InfiniBand fabric Subnet Manager
   Loaded: loaded (/usr/lib/systemd/system/opensm.service; enabled; vendor preset: disabled)
   Active: active (running) since Mon 2017-08-21 14:39:36 SAST; 21h ago
     Docs: man:opensm
 Main PID: 947 (opensm-launch)
   CGroup: /system.slice/opensm.service
           ├─947 /bin/bash /usr/libexec/opensm-launch
           └─948 /usr/sbin/opensm

* Remove a few conflicting packages and kernel modules (installed by Bright)

 [root@bright1 MLNX_OFED_LINUX-3.4-2.1.8.0-rhel7.3-x86_64]# yum remove compat-dapl-devel compat-dapl-utils
 [root@bright1 MLNX_OFED_LINUX-3.4-2.1.8.0-rhel7.3-x86_64]# /etc/init.d/opensmd status
 opensm (pid 23365) is running...
 [root@bright1 MLNX_OFED_LINUX-3.4-2.1.8.0-rhel7.3-x86_64]# /etc/init.d/opensmd stop
 Stopping opensmd (via systemctl):                          [  OK  ]
 [root@bright1 MLNX_OFED_LINUX-3.4-2.1.8.0-rhel7.3-x86_64]#
 [root@bright1 ~]# modprobe -r ib_isert rpcrdma ib_srpt ib_srp ib_ucm ib_ipoib ib_iser rdma_ucm rdma_cm ib_cm

* Install OFED

 [root@bright1 MLNX_OFED_LINUX-3.4-2.1.8.0-rhel7.3-x86_64]# ./mlnxofedinstall

* Restart openibd

 
 [root@bright1 MLNX_OFED_LINUX-3.4-2.1.8.0-rhel7.3-x86_64]# /etc/init.d/openibd restart
 Unloading HCA driver:                                      [  OK  ]
 Loading HCA driver and Access Layer:                       [  OK  ]
 [root@bright1 MLNX_OFED_LINUX-3.4-2.1.8.0-rhel7.3-x86_64]# /etc/init.d/openibd status

   HCA driver loaded

 Configured IPoIB devices:
 ib0

 Currently active IPoIB devices:
 ib0
 Configured Mellanox EN devices:

 Currently active Mellanox devices:
 ib0

 The following OFED modules are loaded:

   rdma_ucm
   rdma_cm
   ib_addr
   ib_ipoib
   mlx4_core
   mlx4_ib
   mlx4_en
   mlx5_core
   mlx5_ib
  ib_uverbs
  ib_umad
  ib_ucm
  ib_sa
  ib_cm
  ib_mad
  ib_core
  ib_netlink

 [root@bright1 MLNX_OFED_LINUX-3.4-2.1.8.0-rhel7.3-x86_64]# /etc/init.d/opensmd status
 opensm (pid 27388) is running...
 [root@bright1 MLNX_OFED_LINUX-3.4-2.1.8.0-rhel7.3-x86_64]#

* Install on bright2 as well, to test the procedure again, and do an ibping test

[root@bright1 MLNX_OFED_LINUX-3.4-2.1.8.0-rhel7.3-x86_64]# ibstat
CA 'mlx4_0'
CA type: MT4099
Number of ports: 1
Firmware version: 2.36.5000
Hardware version: 1
Node GUID: 0x248a070300bc48a0
System image GUID: 0x248a070300bc48a3
Port 1:
State: Active
Physical state: LinkUp
Rate: 40
Base lid: 1
LMC: 0
SM lid: 1
Capability mask: 0x0251486a
Port GUID: 0x248a070300bc48a1
Link layer: InfiniBand
[root@bright1 MLNX_OFED_LINUX-3.4-2.1.8.0-rhel7.3-x86_64]# ibping -S

[root@bright2 ~]# ibping -G 0x248a070300bc48a1
Pong from bright1.(none) (Lid 1): time 0.140 ms
Pong from bright1.(none) (Lid 1): time 0.170 ms
Pong from bright1.(none) (Lid 1): time 0.167 ms
Pong from bright1.(none) (Lid 1): time 0.103 ms
Pong from bright1.(none) (Lid 1): time 0.098 ms
Pong from bright1.(none) (Lid 1): time 0.202 ms
Pong from bright1.(none) (Lid 1): time 0.165 ms
Pong from bright1.(none) (Lid 1): time 0.156 ms

* Install on all the nodes

* Bring up ib0 on all the nodes

 [root@bright1 ~]# pdsh -g all ifup ib0
 bright2:
 imlnode:
 cnode01:
 cnode02:
 nfs01:
 mds02:
 oss02:
 cnode03:
 cnode04:
 sched:
 login1:
 cnode15:
 oss01:
 login2:
 nfs02:
 cnode13:
 cnode11:
 mds01:
 cnode12:
 cnode05:
 cnode14:
 cnode07:
 cnode16:
 cnode08:
 cnode10:
 cnode09:
 cnode06:
 imlnode: Determining IP information for ib0... done.
 bright2: Determining IP information for ib0... done.
 cnode01: Determining IP information for ib0... done.
 cnode02: Determining IP information for ib0... done.
 sched: Determining IP information for ib0... done.
 login1: Determining IP information for ib0... done.
 login2: Determining IP information for ib0... done.
 cnode03: Determining IP information for ib0... done.
 cnode04: Determining IP information for ib0... done.
 cnode15: Determining IP information for ib0... done.
 mds01: Determining IP information for ib0... done.
 cnode13: Determining IP information for ib0... done.
 cnode11: Determining IP information for ib0... done.
 cnode12: Determining IP information for ib0... done.
 cnode05: Determining IP information for ib0... done.
 cnode16: Determining IP information for ib0... done.
 cnode07: Determining IP information for ib0... done.
 cnode08: Determining IP information for ib0... done.
 cnode09: Determining IP information for ib0... done.
 cnode10: Determining IP information for ib0... done.
 cnode14: Determining IP information for ib0... done.
 cnode06: Determining IP information for ib0... done.
 mds02: Determining IP information for ib0... done.
 nfs02: Determining IP information for ib0... done.
 oss02: Determining IP information for ib0... done.
 nfs01: Determining IP information for ib0... done.
 oss01: Determining IP information for ib0... done.
 [root@bright1 ~]#

* Show ib addresses

 [root@bright1 ~]# pdsh -g all "ip addr show ib0|grep -w inet"|sort -n
 bright1:     inet 172.20.0.1/24 brd 172.20.0.255 scope global ib0
 bright2:     inet 172.20.0.214/24 brd 172.20.0.255 scope global dynamic ib0
 cnode01:     inet 172.20.0.211/24 brd 172.20.0.255 scope global dynamic ib0
 cnode02:     inet 172.20.0.210/24 brd 172.20.0.255 scope global dynamic ib0
 cnode03:     inet 172.20.0.209/24 brd 172.20.0.255 scope global dynamic ib0
 cnode04:     inet 172.20.0.212/24 brd 172.20.0.255 scope global dynamic ib0
 cnode05:     inet 172.20.0.202/24 brd 172.20.0.255 scope global dynamic ib0
 cnode06:     inet 172.20.0.197/24 brd 172.20.0.255 scope global dynamic ib0
 cnode07:     inet 172.20.0.200/24 brd 172.20.0.255 scope global dynamic ib0
 cnode08:     inet 172.20.0.201/24 brd 172.20.0.255 scope global dynamic ib0
 cnode09:     inet 172.20.0.199/24 brd 172.20.0.255 scope global dynamic ib0
 cnode10:     inet 172.20.0.204/24 brd 172.20.0.255 scope global dynamic ib0
 cnode11:     inet 172.20.0.198/24 brd 172.20.0.255 scope global dynamic ib0
 cnode12:     inet 172.20.0.208/24 brd 172.20.0.255 scope global dynamic ib0
 cnode13:     inet 172.20.0.203/24 brd 172.20.0.255 scope global dynamic ib0
 cnode14:     inet 172.20.0.205/24 brd 172.20.0.255 scope global dynamic ib0
 cnode15:     inet 172.20.0.207/24 brd 172.20.0.255 scope global dynamic ib0
 cnode16:     inet 172.20.0.206/24 brd 172.20.0.255 scope global dynamic ib0
 imlnode:     inet 172.20.0.217/24 brd 172.20.0.255 scope global ib0
 login1:     inet 172.20.0.216/24 brd 172.20.0.255 scope global dynamic ib0
 login2:     inet 172.20.0.215/24 brd 172.20.0.255 scope global dynamic ib0
 mds01:     inet 172.20.0.222/24 brd 172.20.0.255 scope global ib0
 mds02:     inet 172.20.0.221/24 brd 172.20.0.255 scope global ib0
 nfs01:     inet 172.20.0.220/24 brd 172.20.0.255 scope global ib0
 nfs02:     inet 172.20.0.223/24 brd 172.20.0.255 scope global ib0
 oss01:     inet 172.20.0.218/24 brd 172.20.0.255 scope global ib0
 oss02:     inet 172.20.0.219/24 brd 172.20.0.255 scope global ib0
 sched:     inet 172.20.0.213/24 brd 172.20.0.255 scope global dynamic ib0

* OpenSM (Things) - default opensm installed by Bright Cluster Manager

 [root@bright1 ~]# systemctl stop opensm
 [root@bright1 ~]# opensm --create-config /etc/opensm/opensm.conf
 -------------------------------------------------
 OpenSM 4.8.0.MLNX20161013.9b1a49b
 Command Line Arguments:
  Creating config file template '/etc/opensm/opensm.conf'.
  Log File: /var/log/opensm.log
 -------------------------------------------------
 [root@bright1 ~]#
/var/www/wiki/data/pages/acelab/install_and_configure.txt · Last modified: 2018/07/27 19:12 by smasoka