最近在学习docker的相关东西。在看到docker的网络模型的时候,发现docker提供了较为丰富的网络配置用以满足不同的应用场景:

  • host模式,使用--net=host指定。
  • container模式,使用--net=container:NAME_or_ID指定。
  • none模式,使用--net=none指定。
  • bridge模式,使用--net=bridge指定,默认设置。

使用host模式的话,这个docker实例的net namespace和宿主机一样,也就是说不是相互隔离的。各个容器不隔离也就带来了如下问题,一个端口只能使用1次,可扩展性比较差。而使用bridge模式就可以针对不同的docker实例设置不同的net namespace,它们之间的网络是相互隔离的,这也就为在同一台机器上部署两个相同的docker实例提供了可能 。

docker自带的bridge模式比较简单。各个容器的ip地址是172.17..,和默认网桥的ip(172.17.42.1)在同一网段。但是它们的ip和宿主机不在同一网段,也就是对外界通信的时候需要进行NAT转换,这样极不方便而且效率也不是很高。有什么解决方法呢?如果可以将各个docker实例的ip设置的和宿主机在同一网段,那么它们就可以直接进行通信了。有什么方法可以方便的做到这点呢?pipework就可以~~

pipework的github地址如下:https://github.com/jpetazzo/pipework

里面有很详尽的使用说明例子,并且网上也有不少,这里就不在多说了。下面主要分析下pipework的源码,目的主要是学习一下linux网络相关的知识,更好的理解docker模型,同时也是为了熟悉下shell脚本的基本用法。

下面只针对linux + 网桥 + 静态ip分配做了注解,其他的一些功能由于暂时还没有用到所以这里就先不说明了。


#!/bin/sh

# This code should (try to) follow Google's Shell Style Guide

# (https://google-styleguide.googlecode.com/svn/trunk/shell.xml)



### 当bash脚本执行出现错误的时候立刻退出脚本。

### help set中可以看到:-e  Exit immediately if a command exits with a non-zero status.

set -e



### $1获得第一个参数,如果参数等于--wait,则设置WAIT为1

### It will wait until the eth1 interface is present and in UP operational state, then exit gracefully.

### 该命令推荐在container内部使用

case "$1" in

  --wait)

    WAIT=1

    ;;

esac



### IFNAME设置为第一个参数

IFNAME=$1



### 设置CONTAINER_IFNAME的值

# default value set further down if not set here

CONTAINER_IFNAME=

if [ "$2" = "-i" ]; then

  CONTAINER_IFNAME=$3

  ### shift代表参数左移2位,也就是参数从1,2,3...变为3,...

  shift 2

fi



if [ "$2" = "-l" ]; then

  LOCAL_IFNAME=$3

  shift 2

fi



### 获得CONTAINER的名称,IP地址,mac地址

GUESTNAME=$2

IPADDR=$3

MACADDR=$4



### 当MAC地址满足*@*的时候,对VLAN和MACADDR进行赋值

case "$MACADDR" in

  *@*)

    VLAN="${MACADDR#*@}"

    VLAN="${VLAN%%@*}"

    MACADDR="${MACADDR%%@*}"

    ;;

  *)

    VLAN=

    ;;

esac



### 当IP地址为空或者WAIT为空的时候格式错误

[ "$IPADDR" ] || [ "$WAIT" ] || {

  echo "Syntax:"

  echo "pipework <hostinterface> [-i containerinterface] [-l localinterfacename] <guest> <ipaddr>/<subnet>[@default_gateway] [macaddr][@vlan]"

  echo "pipework <hostinterface> [-i containerinterface] [-l localinterfacename] <guest> dhcp [macaddr][@vlan]"

  echo "pipework --wait [-i containerinterface]"

  exit 1

}



# Succeed if the given utility is installed. Fail otherwise.

# For explanations about `which` vs `type` vs `command`, see:

# http://stackoverflow.com/questions/592620/check-if-a-program-exists-from-a-bash-script/677212#677212

# (Thanks to @chenhanxiao for pointing this out!)



### >/dev/null 2>&1用于输出重定向

### http://stackoverflow.com/questions/10508843/what-is-dev-null-21



installed () {

  command -v "$1" >/dev/null 2>&1

}



# Google Styleguide says error messages should go to standard error.

warn () {

  echo "$@" >&2

}

die () {

  status="$1"

  shift

  warn "$@"

  exit "$status"

}



# First step: determine type of first argument (bridge, physical interface...),

# Unless "--wait" is set (then skip the whole section)



### WAIT没有值的时候

if [ -z "$WAIT" ]; then

  if [ -d "/sys/class/net/$IFNAME" ] ### 判断网桥对应的目录存不存在

  then

    if [ -d "/sys/class/net/$IFNAME/bridge" ]; then ###bridge存在

      IFTYPE=bridge

      BRTYPE=linux

    elif installed ovs-vsctl && ovs-vsctl list-br|grep -q "^${IFNAME}$"; then ###当ovs_vsctl存在并且能够找到对应的网桥的时候

      IFTYPE=bridge

      BRTYPE=openvswitch

    elif [ "$(cat "/sys/class/net/$IFNAME/type")" -eq 32 ]; then # InfiniBand IPoIB interface type 32

      IFTYPE=ipoib

      # The IPoIB kernel module is fussy, set device name to ib0 if not overridden

      CONTAINER_IFNAME=${CONTAINER_IFNAME:-ib0}

      PKEY=$VLAN

    else IFTYPE=phys

    fi

  else ###网桥不存在,判断输入的网桥的名称如果是以br开通偶的则设置对应的BRTYPE为linux;否则判断是否安装了ovs-vsctl

    case "$IFNAME" in

      br*)

        IFTYPE=bridge

        BRTYPE=linux

        ;;

      ovs*)

        if ! installed ovs-vsctl; then

          die 1 "Need OVS installed on the system to create an ovs bridge"

        fi

        IFTYPE=bridge

        BRTYPE=openvswitch

        ;;

      *) die 1 "I do not know how to setup interface $IFNAME." ;;

    esac

  fi

fi



# Set the default container interface name to eth1 if not already set

CONTAINER_IFNAME=${CONTAINER_IFNAME:-eth1}



### 当使用了--WAIT参数的时候,该脚本会虚线的循环下去 直到对应的设备创建成功

### 才会退出循环

[ "$WAIT" ] && {

  while true; do

    # This first method works even without `ip` or `ifconfig` installed,

    # but doesn't work on older kernels (e.g. CentOS 6.X). See #128.

    grep -q '^1$' "/sys/class/net/$CONTAINER_IFNAME/carrier" && break

    # This method hopefully works on those older kernels.

    ip link ls dev "$CONTAINER_IFNAME" && break

    sleep 1

  done > /dev/null 2>&1 ###后台运行

  exit 0

}



### 几种不支持的类型

[ "$IFTYPE" = bridge ] && [ "$BRTYPE" = linux ] && [ "$VLAN" ] && {

  die 1 "VLAN configuration currently unsupported for Linux bridge."

}



[ "$IFTYPE" = ipoib ] && [ "$MACADDR" ] && {

  die 1 "MACADDR configuration unsupported for IPoIB interfaces."

}



# Second step: find the guest (for now, we only support LXC containers)

while read _ mnt fstype options _; do

  [ "$fstype" != "cgroup" ] && continue

  echo "$options" | grep -qw devices || continue

  CGROUPMNT=$mnt

done < /proc/mounts



[ "$CGROUPMNT" ] || {

    die 1 "Could not locate cgroup mount point."

}



# Try to find a cgroup matching exactly the provided name.

N=$(find "$CGROUPMNT" -name "$GUESTNAME" | wc -l)

case "$N" in

  0)

    # If we didn't find anything, try to lookup the container with Docker.

    if installed docker; then

      RETRIES=3

      while [ "$RETRIES" -gt 0 ]; do

        ### 检查对应的容器是否启动成功

        DOCKERPID=$(docker inspect --format='\{\{ .State.Pid \}\}' "$GUESTNAME")

        [ "$DOCKERPID" != 0 ] && break

        sleep 1

        RETRIES=$((RETRIES - 1))

      done



      [ "$DOCKERPID" = 0 ] && {

        die 1 "Docker inspect returned invalid PID 0"

      }



      [ "$DOCKERPID" = "<no value>" ] && {

        die 1 "Container $GUESTNAME not found, and unknown to Docker."

      }

    else

      die 1 "Container $GUESTNAME not found, and Docker not installed."

    fi

    ;;

  1) true ;;

  *) die 1 "Found more than one container matching $GUESTNAME." ;;

esac



### 判断所需要分配给container的ip地址是dhcp还是static

case "$IPADDR" in

  # Let's check first if the user asked for DHCP allocation.

  dhcp|dhcp:*)

    # Use Docker-specific strategy to run the DHCP client

    # from the busybox image, in the network namespace of

    # the container.

    if ! [ "$DOCKERPID" ]; then

      warn "You asked for a Docker-specific DHCP method."

      warn "However, $GUESTNAME doesn't seem to be a Docker container."

      warn "Try to replace 'dhcp' with another option?"

      die 1 "Aborting."

    fi

    DHCP_CLIENT=${IPADDR%%:*}

    ;;

  udhcpc|udhcpc:*|dhcpcd|dhcpcd:*|dhclient|dhclient:*)

    DHCP_CLIENT=${IPADDR%%:*}

    if ! installed "$DHCP_CLIENT"; then

      die 1 "You asked for DHCP client $DHCP_CLIENT, but I can't find it."

    fi

    ;;

  # Alright, no DHCP? Then let's see if we have a subnet *and* gateway.

  ### 设置的静态ip地址,并且设置了默认网关

  */*@*)

    ### ${string#substring}    从变量$string的开头, 删除最短匹配$substring的子串

    ### ${string##substring}   从变量$string的开头, 删除最长匹配$substring的子串

    ### ${string%substring}    从变量$string的结尾, 删除最短匹配$substring的子串

    ### ${string%%substring}   从变量$string的结尾, 删除最长匹配$substring的子串

    GATEWAY="${IPADDR#*@}" GATEWAY="${GATEWAY%%@*}"

    IPADDR="${IPADDR%%@*}"

    ;;

  # No gateway? We need at least a subnet, anyway!

  */*) : ;;

  # ... No? Then stop right here.

  *)

    warn "The IP address should include a netmask."

    die 1 "Maybe you meant $IPADDR/24 ?"

    ;;

esac



### 如果是DHCP

# If a DHCP method was specified, extract the DHCP options.

if [ "$DHCP_CLIENT" ]; then

  case "$IPADDR" in

    *:*) DHCP_OPTIONS="${IPADDR#*:}" ;;

  esac

fi



echo cgroupmnt $CGROUPMNT

echo guestname $GUESTNAME



if [ "$DOCKERPID" ]; then

  NSPID=$DOCKERPID

else

  ### find /sys/fs/cgroup/devices/ -name "xxxx"

  ### lxc-info -n "xxxx"

  ### 上面两种方法都是为了能够找到容器中的一个进程,方便获取net namespace用于网桥以及veth等操作

  NSPID=$(head -n 1 "$(find "$CGROUPMNT" -name "$GUESTNAME" | head -n 1)/tasks")

  [ "$NSPID" ] || {

    # it is an alternative way to get the pid

    NSPID=$(lxc-info -n  "$GUESTNAME" | grep PID | grep -Eo '[0-9]+')

    [ "$NSPID" ] || {

      die 1 "Could not find a process inside container $GUESTNAME."

    }

  }

fi



# Check if an incompatible VLAN device already exists

[ "$IFTYPE" = phys ] && [ "$VLAN" ] && [ -d "/sys/class/net/$IFNAME.VLAN" ] && {

  ip -d link show "$IFNAME.$VLAN" | grep -q "vlan.*id $VLAN" || {

    die 1 "$IFNAME.VLAN already exists but is not a VLAN device for tag $VLAN"

  }

}



[ ! -d /var/run/netns ] && mkdir -p /var/run/netns

rm -f "/var/run/netns/$NSPID"

ln -s "/proc/$NSPID/ns/net" "/var/run/netns/$NSPID"



# Check if we need to create a bridge.



### 检查网桥是否存在,如果不存在则创建

[ "$IFTYPE" = bridge ] && [ ! -d "/sys/class/net/$IFNAME" ] && {

  [ "$BRTYPE" = linux ] && {

    ### 创建网桥有两种方法

    ### 方法1:ip link add dev **** type bridge

    ### 方法2: brctl addbr ****

    (ip link add dev "$IFNAME" type bridge > /dev/null 2>&1) || (brctl addbr "$IFNAME")

    ip link set "$IFNAME" up

  }

  [ "$BRTYPE" = openvswitch ] && {

    ovs-vsctl add-br "$IFNAME"

  }

}



### 获取MTU(Maximum transmission unit) https://en.wikipedia.org/wiki/Maximum_transmission_unit

MTU=$(ip link show "$IFNAME" | awk '{print $5}')

# If it's a bridge, we need to create a veth pair

[ "$IFTYPE" = bridge ] && {

  if [ -z "$LOCAL_IFNAME" ]; then

    ### 如果LOCAL_IFNAME为空,则按照如下规则给LOCAL_IFNAME进行赋值;如果CONTAINER_IFNAME没有指定的话

    ### 则使用eth1,那么LOCAL_IFNAME形如veth1plxxxxx

    LOCAL_IFNAME="v${CONTAINER_IFNAME}pl${NSPID}"

  fi

  ### GUEST_IFNAME也是如此,不同的是local是pl,guest是pg

  GUEST_IFNAME="v${CONTAINER_IFNAME}pg${NSPID}"

  # Does the link already exist?

  if ip link show "$LOCAL_IFNAME" >/dev/null 2>&1; then

    # link exists, is it in use?

    if ip link show "$LOCAL_IFNAME" up | grep -q "UP"; then

      echo "Link $LOCAL_IFNAME exists and is up"

      exit 1

    fi

    # delete the link so we can re-add it afterwards

    ip link del "$LOCAL_IFNAME"

  fi

  ### 添加VETH设备,名称为LOCAL_IFNAME

  ip link add name "$LOCAL_IFNAME" mtu "$MTU" type veth peer name "$GUEST_IFNAME" mtu "$MTU"

  case "$BRTYPE" in

    linux)

      ### 两种方法将网桥和VETH连接起来

      (ip link set "$LOCAL_IFNAME" master "$IFNAME" > /dev/null 2>&1) || (brctl addif "$IFNAME" "$LOCAL_IFNAME")

      ;;

    openvswitch)

      if ! ovs-vsctl list-ports "$IFNAME" | grep -q "$LOCAL_IFNAME"; then

        ovs-vsctl add-port "$IFNAME" "$LOCAL_IFNAME" ${VLAN:+tag="$VLAN"}

      fi

      ;;

  esac

  ### 启动VETH

  ip link set "$LOCAL_IFNAME" up

}



# If it's a physical interface, create a macvlan subinterface

[ "$IFTYPE" = phys ] && {

  [ "$VLAN" ] && {

    [ ! -d "/sys/class/net/${IFNAME}.${VLAN}" ] && {

      ip link add link "$IFNAME" name "$IFNAME.$VLAN" mtu "$MTU" type vlan id "$VLAN"

    }

    ip link set "$IFNAME" up

    IFNAME=$IFNAME.$VLAN

  }

  GUEST_IFNAME=ph$NSPID$CONTAINER_IFNAME

  ip link add link "$IFNAME" dev "$GUEST_IFNAME" mtu "$MTU" type macvlan mode bridge

  ip link set "$IFNAME" up

}



# If it's an IPoIB interface, create a virtual IPoIB interface (the IPoIB

# equivalent of a macvlan device)

#

# Note: no macvlan subinterface nor Ethernet bridge can be created on top of an

# IPoIB interface. InfiniBand is not Ethernet. IPoIB is an IP layer on top of

# InfiniBand, without an intermediate Ethernet layer.

[ "$IFTYPE" = ipoib ] && {

  GUEST_IFNAME="${IFNAME}.${NSPID}"



  # If a partition key is provided, use it

  [ "$PKEY" ] && {

    GUEST_IFNAME="${IFNAME}.${PKEY}.${NSPID}"

    PKEY="pkey 0x$PKEY"

  }



  ip link add link "$IFNAME" name "$GUEST_IFNAME" type ipoib $PKEY

  ip link set "$IFNAME" up

}



### 设置VETH的另一端的net namespace和container一样

ip link set "$GUEST_IFNAME" netns "$NSPID"

### 在该net namespace下,给VETH的另一端重新命名为CONTAINER_IFNAME,默认情况下为eth1

ip netns exec "$NSPID" ip link set "$GUEST_IFNAME" name "$CONTAINER_IFNAME"

[ "$MACADDR" ] && ip netns exec "$NSPID" ip link set dev "$CONTAINER_IFNAME" address "$MACADDR"



# When using any of the DHCP methods, we start a DHCP client in the

# network namespace of the container. With the 'dhcp' method, the

# client used is taken from the Docker busybox image (therefore

# requiring no specific client installed on the host). Other methods

# use a locally installed client.

case "$DHCP_CLIENT" in

  dhcp)

    docker run -d --net container:$GUESTNAME --cap-add NET_ADMIN

           busybox udhcpc -i "$CONTAINER_IFNAME" -x "hostname:$GUESTNAME"

           $DHCP_OPTIONS

           >/dev/null

    ;;

  udhcpc)

    ip netns exec "$NSPID" "$DHCP_CLIENT" -qi "$CONTAINER_IFNAME"

                                          -x "hostname:$GUESTNAME"

                                          $DHCP_OPTIONS

    ;;

  dhclient)

    ip netns exec "$NSPID" "$DHCP_CLIENT" "$CONTAINER_IFNAME"

                                          -pf "/var/run/dhclient.$NSPID.pid"

                                          -H "$GUESTNAME"

                                          $DHCP_OPTIONS

    # kill dhclient after get ip address to prevent device be used after container close

    kill "$(cat "/var/run/dhclient.$NSPID.pid")"

    rm "/var/run/dhclient.$NSPID.pid"

    ;;

  dhcpcd)

    ip netns exec "$NSPID" "$DHCP_CLIENT" -q "$CONTAINER_IFNAME" -h "$GUESTNAME"

    ;;

  "")

    ### 添加新的ip

    ip netns exec "$NSPID" ip addr add "$IPADDR" dev "$CONTAINER_IFNAME"

    ### 先出默认的路由配置?

    [ "$GATEWAY" ] && {

      ip netns exec "$NSPID" ip route delete default >/dev/null 2>&1 && true

    }

    ip netns exec "$NSPID" ip link set "$CONTAINER_IFNAME" up

    ### 重新添加路由

    [ "$GATEWAY" ] && {

      ### ip route get命令和ip route show命令执行的操作是不同的。

      ### ip route show命令只是显示现有的路由,而ip route get命令在必要时会派生出新的路由。

      ip netns exec "$NSPID" ip route get "$GATEWAY" >/dev/null 2>&1 ||

      ip netns exec "$NSPID" ip route add "$GATEWAY/32" dev "$CONTAINER_IFNAME"

      ip netns exec "$NSPID" ip route replace default via "$GATEWAY"

    }

    ;;

esac



# Give our ARP neighbors a nudge about the new interface

if installed arping; then

  IPADDR=$(echo "$IPADDR" | cut -d/ -f1)

  ip netns exec "$NSPID" arping -c 1 -A -I "$CONTAINER_IFNAME" "$IPADDR" > /dev/null 2>&1 || true

else

  echo "Warning: arping not found; interface may not be immediately reachable"

fi



# Remove NSPID to avoid `ip netns` catch it.

rm -f "/var/run/netns/$NSPID"



# vim: set tabstop=2 shiftwidth=2 softtabstop=2 expandtab :