From 5f2233ea717f3f68e158cf4c54bb51b58802176e Mon Sep 17 00:00:00 2001
From: Tycho Andersen
Date: Thu, 20 Oct 2016 10:25:28 -0600
Subject: [PATCH] net: add support for macvlan link types

While this is in principle similar to how veths are handled, we have to do
things in two different ways depending on whether or not there is a user
namespace involved, because there is no way to ask the kernel to attach a
macvlan NIC to a device in a net ns that we don't have CAP_NET_ADMIN in.

So we do it in two ways:

a. If we are in a user namespace, we create the device in usernsd and use
   IFLA_NET_NS_FD to set the netns which it should be created in (saving us
   a "move into this netns" step).

b. If we aren't in a user namespace, we could still be in a net namespace,
   so we use IFLA_LINK_NETNSID to set the namespace that the i/o device will
   be in. Then we open a netlink socket from criu's netns and use
   IFLA_NET_NS_FD to tell the kernel to create the macvlan device in the
   target's namespace.

v2:
* s/CLONE_NEWNET/CLONE_NEWUSER
* Don't bother to dump IFLA_LINK and IFLA_LINK_NETNSID. Although we need to
  provide these on restore, there's no kernel interface that persists these.
  To populate IFLA_LINK, we require users pass --macvlan-pair, and we create
  a NETNSID relation as needed and pass that in for macvlan links (although
  this infrastructure could be used elsewhere for links that need it in the
  future, since it is in the hoisted populate_newlink_req()).
* use new external command instead of creating a --macvlan-pair option

v3: add a feature check for linux/net_namespace.h, since not every arch in
travis has this (new-ish) header

v4:
* include sys/types.h instead of linux/if.h to get IFF_UP flag
* remove old doc addition about --macvlan-pair option

v5: define IFLA_LINK_NETNSID and RTM_NEWNSID if they don't exist

v6: define IFLA_MACVLAN_FLAGS and bump the size of IFLA_MACVLAN_MAX when
necessary

v7:
* remove unused struct macvlan_pair
* split feature test for linux/net_namespace.h into separate patch
* move IFLA_INFO_MAX testing in dump_one_netdev to the right patch
* add documentation for newlink_extras fields
* split changeflags into separate patch
* use existing netnsid if we get EEXIST
* move macvlan code to a helper function
* use netnsid to restore in userns case, and not pid

v8:
* define RTM_GETNSID since we use that too now :)
* don't bother with IFLA_MACVLAN_MAX; we only understand things up to
  IFLA_MACVLAN_FLAGS, so let's just use that as our max instead. The problem
  with using macros here is that IFLA_MACVLAN_MAX is defined as a macro with
  an enum expansion in it, so we get bitten by the enum not being available
  at preprocessing time, and implicit zero coercion when testing against its
  value for stuff. Yeesh.
v10:
* add some comments about when we set up NET_NS_FD and why we use IFLA_LINK
  and IFLA_NET_NS_ID
* use the socket opened in restore_links() instead of opening one in
  restore_one_macvlan()
* split the new argument to restore_one_link into its own patch

travis-ci: success for series starting with [v10,01/11] net: pass the struct nlattrs to dump() functions
Signed-off-by: Tycho Andersen
Signed-off-by: Pavel Emelyanov
---
 criu/crtools.c            |   1 +
 criu/external.c           |   9 ++
 criu/include/libnetlink.h |   4 +
 criu/include/net.h        |   2 +
 criu/net.c                | 321 ++++++++++++++++++++++++++++++++++++++
 images/Makefile           |   1 +
 images/macvlan.proto      |   6 +
 images/netdev.proto       |   4 +
 8 files changed, 348 insertions(+)
 create mode 100644 images/macvlan.proto

diff --git a/criu/crtools.c b/criu/crtools.c
index 535ceaf19..c47bb8659 100644
--- a/criu/crtools.c
+++ b/criu/crtools.c
@@ -830,6 +830,7 @@ usage:
 " Formats of RES on restore:\n"
 " dev[VAL]:DEVPATH\n"
 " veth[IFNAME]:OUTNAME{@BRIDGE}\n"
+" macvlan[IFNAME]:OUTNAME\n"
 "\n"
 "* Special resources support:\n"
 " --" SK_EST_PARAM " checkpoint/restore established TCP connections\n"
diff --git a/criu/external.c b/criu/external.c
index d920fcf2e..3dc9516b5 100644
--- a/criu/external.c
+++ b/criu/external.c
@@ -3,6 +3,9 @@
 #include "cr_options.h"
 #include "xmalloc.h"
 #include "external.h"
+#include "util.h"
+
+#include "net.h"
 
 int add_external(char *key)
 {
@@ -12,6 +15,12 @@ int add_external(char *key)
 	if (!ext)
 		return -1;
 	ext->id = key;
+
+	if (strstartswith(key, "macvlan") && macvlan_ext_add(ext) < 0) {
+		xfree(ext);
+		return -1;
+	}
+
 	list_add(&ext->node, &opts.external);
 
 	return 0;
diff --git a/criu/include/libnetlink.h b/criu/include/libnetlink.h
index 591af0e9e..0549ef984 100644
--- a/criu/include/libnetlink.h
+++ b/criu/include/libnetlink.h
@@ -13,5 +13,9 @@ extern int addattr_l(struct nlmsghdr *n, int maxlen, int type,
 
 #define NLMSG_TAIL(nmsg) \
 	((struct rtattr *) (((void *) (nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
+#ifndef NETNS_RTA
+#define NETNS_RTA(r) \
+	((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct rtgenmsg))))
+#endif
 
 #endif /* __CR_LIBNETLINK_H__ */
diff --git a/criu/include/net.h b/criu/include/net.h
index f05fa9079..f6995582d 100644
--- a/criu/include/net.h
+++ b/criu/include/net.h
@@ -4,6 +4,7 @@
 #include
 
 #include "common/list.h"
+#include "external.h"
 
 struct cr_imgset;
 extern int dump_net_ns(int ns_id);
@@ -30,6 +31,7 @@ extern int read_ns_sys_file(char *path, char *buf, int len);
 extern int restore_link_parms(NetDeviceEntry *nde, int nlsk);
 
 extern int veth_pair_add(char *in, char *out);
+extern int macvlan_ext_add(struct external *ext);
 extern int move_veth_to_bridge(void);
 
 #endif /* __CR_NET_H__ */
diff --git a/criu/net.c b/criu/net.c
index c2870e80c..641e32936 100644
--- a/criu/net.c
+++ b/criu/net.c
@@ -1,6 +1,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -10,6 +11,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -35,6 +37,22 @@
 #include "protobuf.h"
 #include "images/netdev.pb-c.h"
 
+#ifndef IFLA_LINK_NETNSID
+#define IFLA_LINK_NETNSID 37
+#endif
+
+#ifndef RTM_NEWNSID
+#define RTM_NEWNSID 88
+#endif
+
+#ifndef RTM_GETNSID
+#define RTM_GETNSID 90
+#endif
+
+#ifndef IFLA_MACVLAN_FLAGS
+#define IFLA_MACVLAN_FLAGS 2
+#endif
+
 static int ns_sysfs_fd = -1;
 
 int read_ns_sys_file(char *path, char *buf, int len)
@@ -509,6 +527,46 @@ static int dump_bridge(NetDeviceEntry *nde, struct cr_imgset *imgset, struct nla
 	return write_netdev_img(nde, imgset, info);
 }
 
+/*
+ * Dump a macvlan link: parse the nested IFLA_INFO_DATA attributes, record
+ * the mode (mandatory) and the flags (when reported) in a MacvlanLinkEntry
+ * attached to the NetDeviceEntry, then write the entry to the image.
+ * Returns -1 on missing/unparsable data, else write_netdev_img()'s result.
+ */
+static int dump_macvlan(NetDeviceEntry *nde, struct cr_imgset *imgset, struct nlattr **info)
+{
+	MacvlanLinkEntry macvlan = MACVLAN_LINK_ENTRY__INIT;
+	int ret;
+	struct nlattr *data[IFLA_MACVLAN_FLAGS+1];
+
+	if (!info || !info[IFLA_INFO_DATA]) {
+		pr_err("no data for macvlan\n");
+		return -1;
+	}
+
+	ret = nla_parse_nested(data, IFLA_MACVLAN_FLAGS, info[IFLA_INFO_DATA], NULL);
+	if (ret < 0) {
+		pr_err("failed to parse macvlan data\n");
+		return -1;
+	}
+
+	if (!data[IFLA_MACVLAN_MODE]) {
+		pr_err("macvlan mode required for %s\n", nde->name);
+		return -1;
+	}
+
+	macvlan.mode = *((u32 *)RTA_DATA(data[IFLA_MACVLAN_MODE]));
+
+	if (data[IFLA_MACVLAN_FLAGS]) {
+		/* flags is optional: macvlan_link_info() only restores it if has_flags is set */
+		macvlan.has_flags = true;
+		macvlan.flags = *((u16 *) RTA_DATA(data[IFLA_MACVLAN_FLAGS]));
+	}
+
+	nde->macvlan = &macvlan;
+	return write_netdev_img(nde, imgset, info);
+}
+
 static int dump_one_ethernet(struct ifinfomsg *ifi, char *kind,
 		struct nlattr **tb, struct cr_imgset *fds)
 {
@@ -541,6 +599,8 @@ static int dump_one_ethernet(struct ifinfomsg *ifi, char *kind,
 		pr_warn("GRE tap device %s not supported natively\n", name);
 	}
 
+	if (!strcmp(kind, "macvlan"))
+		return dump_one_netdev(ND_TYPE__MACVLAN, ifi, tb, fds, dump_macvlan);
 	return dump_unknown_device(ifi, kind, tb, fds);
 }
 
@@ -1027,6 +1087,241 @@ static int changeflags(int s, char *name, short flags)
 	return 0;
 }
 
+/*
+ * Callback handed to populate_newlink_req()/restore_one_link() for macvlan
+ * devices: appends IFLA_INFO_KIND "macvlan" and a nested IFLA_INFO_DATA
+ * holding the mode (and the flags, when present in the image).
+ * Returns -1 if the image carries no macvlan entry, 0 otherwise.
+ */
+static int macvlan_link_info(NetDeviceEntry *nde, struct newlink_req *req)
+{
+	struct rtattr *macvlan_data;
+	MacvlanLinkEntry *macvlan = nde->macvlan;
+
+	if (!macvlan) {
+		pr_err("Missing macvlan link entry %d\n", nde->ifindex);
+		return -1;
+	}
+
+	addattr_l(&req->h, sizeof(*req), IFLA_INFO_KIND, "macvlan", 7);
+
+	macvlan_data = NLMSG_TAIL(&req->h);
+	addattr_l(&req->h, sizeof(*req), IFLA_INFO_DATA, NULL, 0);
+
+	addattr_l(&req->h, sizeof(*req), IFLA_MACVLAN_MODE, &macvlan->mode, sizeof(macvlan->mode));
+
+	if (macvlan->has_flags) {
+		/* the attribute is 16 bits wide (dump_macvlan() reads it as a u16) */
+		u16 flags = macvlan->flags;
+
+		addattr_l(&req->h, sizeof(*req), IFLA_MACVLAN_FLAGS, &flags, sizeof(flags));
+	}
+
+	macvlan_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)macvlan_data;
+
+	return 0;
+}
+
+/*
+ * userns_call() handler: adds IFLA_NET_NS_FD (the fd handed over with the
+ * call) to the already populated newlink request in arg and sends it over
+ * a freshly opened NETLINK_ROUTE socket.
+ * Returns do_rtnl_req()'s result, or -1 if the socket can't be created.
+ */
+static int userns_restore_one_link(void *arg, int fd, pid_t pid)
+{
+	int nlsk, ret;
+	struct newlink_req *req = arg;
+
+	nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+	if (nlsk < 0) {
+		pr_perror("Can't create nlk socket");
+		return -1;
+	}
+
+	addattr_l(&req->h, sizeof(*req), IFLA_NET_NS_FD, &fd, sizeof(fd));
+
+	ret = do_rtnl_req(nlsk, req, req->h.nlmsg_len, restore_link_cb, NULL, NULL);
+	close(nlsk);
+	return ret;
+}
+
+/*
+ * do_rtnl_req() callback for RTM_GETNSID: copies the NETNSA_NSID attribute
+ * of the reply into the int pointed to by arg. Fails if that int is still
+ * negative after all attributes have been walked.
+ */
+static int get_nsid_cb(struct nlmsghdr *nlh, void *arg)
+{
+	struct rtgenmsg *rthdr;
+	struct rtattr *rta;
+	int len, *netnsid = arg;
+
+	rthdr = NLMSG_DATA(nlh);
+	len = nlh->nlmsg_len - NLMSG_SPACE(sizeof(*rthdr));
+
+	if (len < 0)
+		return -1;
+
+	rta = NETNS_RTA(rthdr);
+
+	while (RTA_OK(rta, len)) {
+		if (rta->rta_type == NETNSA_NSID)
+			*netnsid = *((int *) RTA_DATA(rta));
+		rta = RTA_NEXT(rta, len);
+	}
+
+	if (*netnsid < 0) {
+		pr_err("Didn't get a netnsid back from netlink?\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Returns a netnsid for the namespace behind the NS_FD_OFF service fd,
+ * requested through nlsk. Tries RTM_NEWNSID first; if the kernel answers
+ * EEXIST, the already assigned id is fetched with RTM_GETNSID instead.
+ * A successfully obtained id is cached in a static and reused by later
+ * calls. Returns -1 on failure.
+ */
+static int get_criu_netnsid(int nlsk)
+{
+	static int netnsid = -1;
+	struct {
+		struct nlmsghdr n;
+		struct rtgenmsg g;
+		char buf[1024];
+	} req;
+	int ns_fd = get_service_fd(NS_FD_OFF), i;
+
+	/* 0 is a valid id (it is the first one requested below); only -1 means "unset" */
+	if (netnsid >= 0)
+		return netnsid;
+
+	for (i = 0; i < 10; i++) {
+		int ret;
+
+		memset(&req, 0, sizeof(req));
+
+		req.n.nlmsg_len = NLMSG_LENGTH(sizeof(req.g));
+		req.n.nlmsg_flags = NLM_F_REQUEST|NLM_F_ACK;
+		req.n.nlmsg_type = RTM_NEWNSID;
+		req.n.nlmsg_seq = CR_NLMSG_SEQ;
+
+		addattr_l(&req.n, sizeof(req), NETNSA_FD, &ns_fd, sizeof(ns_fd));
+		addattr_l(&req.n, sizeof(req), NETNSA_NSID, &i, sizeof(i));
+
+		ret = do_rtnl_req(nlsk, &req, req.n.nlmsg_len, NULL, NULL, NULL);
+		if (ret < 0) {
+			if (ret == -EEXIST) {
+				req.n.nlmsg_type = RTM_GETNSID;
+				ret = do_rtnl_req(nlsk, &req, req.n.nlmsg_len, get_nsid_cb, NULL, &netnsid);
+				if (ret < 0) {
+					pr_err("Couldn't get netnsid: %d\n", ret);
+					return -1;
+				}
+
+				return netnsid;
+			}
+			errno = -ret;
+			pr_perror("couldn't create new netnsid");
+			return -1;
+		}
+
+		netnsid = i;
+		return netnsid;
+	}
+
+	pr_err("tried to create too many netnsids\n");
+	return -1;
+}
+
+/*
+ * Restores one macvlan device. The parent's ifindex comes from the
+ * "macvlan[<name>]" external; the device is created either through
+ * usernsd (user namespace case) or directly over criu_nlsk, and its flags
+ * are re-applied afterwards. Returns -1 on failure, else changeflags()'s result.
+ */
+static int restore_one_macvlan(NetDeviceEntry *nde, int nlsk, int criu_nlsk)
+{
+	struct newlink_extras extras = {
+		.netns_id = -1,
+		.link = -1,
+		.target_netns = -1,
+	};
+	char key[100], *val;
+	int my_netns = -1, ret = -1, s;
+
+	snprintf(key, sizeof(key), "macvlan[%s]", nde->name);
+	val = external_lookup_data(key);
+	if (IS_ERR_OR_NULL(val)) {
+		pr_err("a macvlan parent for %s is required\n", nde->name);
+		return -1;
+	}
+
+	/* link and netns_id are used to identify the master device to plug our
+	 * macvlan slave into. We identify the destination via setting
+	 * IFLA_NET_NS_FD to my_netns, but we have to do that in two different
+	 * ways: in the userns case, we send the fd across to usernsd and set
+	 * it there, whereas in the non-userns case we can just set it here,
+	 * since we can just use a socket from criu's net ns given to us by
+	 * restore_links(). We need to do this two different ways because
+	 * CAP_NET_ADMIN is required in both namespaces, which we don't have in
+	 * the userns case, and usernsd doesn't exist in the non-userns case.
+	 */
+	extras.link = (int) (unsigned long) val;
+
+	extras.netns_id = get_criu_netnsid(nlsk);
+	if (extras.netns_id < 0) {
+		pr_err("failed to get criu's netnsid\n");
+		return -1;
+	}
+
+	my_netns = open_proc(PROC_SELF, "ns/net");
+	if (my_netns < 0) {
+		pr_perror("couldn't get my netns");
+		return -1;
+	}
+
+	if (root_ns_mask & CLONE_NEWUSER) {
+		struct newlink_req req;
+
+		if (populate_newlink_req(&req, RTM_NEWLINK, nde, macvlan_link_info, &extras) < 0)
+			goto out;
+
+		if (userns_call(userns_restore_one_link, 0, &req, sizeof(req), my_netns) < 0) {
+			pr_err("couldn't restore macvlan interface %s via usernsd\n", nde->name);
+			goto out;
+		}
+	} else {
+		extras.target_netns = my_netns;
+		/* ret stays -1, so failures below are reported and my_netns is closed */
+		if (restore_one_link(nde, criu_nlsk, macvlan_link_info, &extras) < 0)
+			goto out;
+	}
+
+	/* We have to change the flags of the NDE manually here because
+	 * we used IFLA_LINK_NETNSID to restore it, which creates the
+	 * device and then shuts it down when it changes the device's
+	 * namespace, but doesn't start it back up when it goes to the
+	 * other namespace. So, we restore its state here.
+	 */
+	s = socket(AF_LOCAL, SOCK_STREAM, 0);
+	if (s < 0) {
+		pr_perror("couldn't open socket for flag changing");
+		goto out;
+	}
+	ret = changeflags(s, nde->name, nde->flags);
+	close(s);
+
+out:
+	if (my_netns >= 0)
+		close(my_netns);
+	return ret;
+}
+
 static int restore_link(NetDeviceEntry *nde, int nlsk, int criu_nlsk)
 {
 	pr_info("Restoring link %s type %d\n", nde->name, nde->type);
@@ -1043,6 +1338,8 @@ static int restore_link(NetDeviceEntry *nde, int nlsk, int criu_nlsk)
 		return restore_one_tun(nde, nlsk);
 	case ND_TYPE__BRIDGE:
 		return restore_one_link(nde, nlsk, bridge_link_info, NULL);
+	case ND_TYPE__MACVLAN:
+		return restore_one_macvlan(nde, nlsk, criu_nlsk);
 	default:
 		pr_err("Unsupported link type %d\n", nde->type);
 		break;
@@ -1730,6 +2027,30 @@ int veth_pair_add(char *in, char *out)
 	return add_external(e_str);
 }
 
+/*
+ * Resolves the value part of a "macvlan..." external to an interface index
+ * and stores it in ext->data (restore_one_macvlan() presumably picks it up
+ * through external_lookup_data()). Returns -1 if no index can be found.
+ */
+int macvlan_ext_add(struct external *ext)
+{
+	const char *parent = external_val(ext);
+
+	/* NOTE(review): assumes external_val() yields NULL when no value was given */
+	if (!parent) {
+		pr_err("no parent device given for %s\n", ext->id);
+		return -1;
+	}
+
+	ext->data = (void *) (unsigned long) if_nametoindex(parent);
+	if (ext->data == 0) {
+		pr_perror("can't get ifindex of %s", ext->id);
+		return -1;
+	}
+
+	return 0;
+}
+
 /*
  * The setns() syscall (called by switch_ns()) can be extremely
  * slow. If we call it two or more times from the same task the
diff --git a/images/Makefile b/images/Makefile
index cf50794cf..eb1852623 100644
--- a/images/Makefile
+++ b/images/Makefile
@@ -60,6 +60,7 @@ proto-obj-y += binfmt-misc.o
 proto-obj-y += time.o
 proto-obj-y += sysctl.o
 proto-obj-y += autofs.o
+proto-obj-y += macvlan.o
 
 CFLAGS += -iquote $(obj)/
 
diff --git a/images/macvlan.proto b/images/macvlan.proto
new file mode 100644
index 000000000..c9c90458e
--- /dev/null
+++ b/images/macvlan.proto
@@ -0,0 +1,6 @@
+syntax = "proto2";
+
+message macvlan_link_entry {
+	required uint32 mode = 1;
+	optional uint32 flags = 2;
+}
diff --git a/images/netdev.proto b/images/netdev.proto
index 19b501c2f..2f2f3d132 100644
--- a/images/netdev.proto
+++ b/images/netdev.proto
@@ -1,5 +1,6 @@
 syntax = "proto2";
 
+import "macvlan.proto";
 import "opts.proto";
 import "tun.proto";
 import "sysctl.proto";
@@ -20,6 +21,7 @@ enum nd_type {
 	 */
 	VENET = 5;
 	BRIDGE = 6;
+	MACVLAN = 7;
 }
 
 message net_device_entry {
@@ -38,6 +40,8 @@ message net_device_entry {
 
 	repeated sysctl_entry conf4 = 9;
 	repeated sysctl_entry conf6 = 10;
+
+	optional macvlan_link_entry macvlan = 11;
 }
 
 message netns_entry {