From f3a43179c8c9ebdf8e5d70f8842c1c8f40aa52f7 Mon Sep 17 00:00:00 2001 From: Stephanie Wang Date: Fri, 28 Oct 2016 11:56:16 -0700 Subject: [PATCH] Unit testing for the plasma manager. --- .travis.yml | 6 + src/plasma/Makefile | 17 +- src/plasma/plasma_client.c | 41 ++++- src/plasma/plasma_client.h | 15 +- src/plasma/plasma_manager.c | 195 ++++++++++++----------- src/plasma/plasma_manager.h | 173 ++++++++++++++++++++- src/plasma/plasma_store.c | 4 +- src/plasma/test/manager_tests.c | 265 ++++++++++++++++++++++++++++++++ src/plasma/test/test.py | 3 - 9 files changed, 609 insertions(+), 110 deletions(-) create mode 100644 src/plasma/test/manager_tests.c diff --git a/.travis.yml b/.travis.yml index 6cdb5ddcaa72..0ca1054682df 100644 --- a/.travis.yml +++ b/.travis.yml @@ -37,6 +37,12 @@ matrix: - sudo apt-get update -qq - sudo apt-get install -qq valgrind script: + - cd src/plasma + - make valgrind + - cd ../.. + + - python src/plasma/test/test.py valgrind + - python src/photon/test/test.py valgrind install: diff --git a/src/plasma/Makefile b/src/plasma/Makefile index ac78f53ce172..5a23523dc532 100644 --- a/src/plasma/Makefile +++ b/src/plasma/Makefile @@ -1,5 +1,6 @@ CC = gcc CFLAGS = -g -Wall --std=c99 -D_XOPEN_SOURCE=500 -D_POSIX_C_SOURCE=200809L -I. -I../common -I../common/thirdparty +TEST_CFLAGS = -DPLASMA_TEST=1 -I. 
BUILD = build all: $(BUILD)/plasma_store $(BUILD)/plasma_manager $(BUILD)/plasma_client.so $(BUILD)/example $(BUILD)/libplasma_client.a @@ -12,6 +13,9 @@ clean: cd ../common; make clean rm -r $(BUILD)/* +$(BUILD)/manager_tests: test/manager_tests.c plasma.h plasma_client.h plasma_client.c plasma_manager.h plasma_manager.c fling.h fling.c common + $(CC) $(CFLAGS) $(TEST_CFLAGS) -o $@ test/manager_tests.c plasma_manager.c plasma_client.c fling.c ../common/build/libcommon.a ../common/thirdparty/hiredis/libhiredis.a + $(BUILD)/plasma_store: plasma_store.c plasma.h fling.h fling.c malloc.c malloc.h thirdparty/dlmalloc.c common $(CC) $(CFLAGS) plasma_store.c fling.c malloc.c ../common/build/libcommon.a -o $(BUILD)/plasma_store @@ -28,12 +32,19 @@ $(BUILD)/example: plasma_client.c plasma.h example.c fling.h fling.c common $(CC) $(CFLAGS) plasma_client.c example.c fling.c ../common/build/libcommon.a -o $(BUILD)/example common: FORCE - cd ../common; make + git submodule update --init --recursive + cd ../common; make # Set the request timeout low for testing purposes. test: CFLAGS += -DRAY_TIMEOUT=50 -test: FORCE - cd ../common; make redis +# First, build and run all the unit tests. +test: $(BUILD)/manager_tests FORCE + ./build/manager_tests + cd ../common; make redis +# Next, build all the executables for Python testing. test: all +valgrind: test + valgrind --leak-check=full --error-exitcode=1 ./build/manager_tests + FORCE: diff --git a/src/plasma/plasma_client.c b/src/plasma/plasma_client.c index 68196737f4a6..d79f9ab67bf6 100644 --- a/src/plasma/plasma_client.c +++ b/src/plasma/plasma_client.c @@ -20,6 +20,9 @@ #include "fling.h" #include "uthash.h" +/* Number of times we try connecting to a socket. */ +#define NUM_CONNECT_ATTEMPTS 50 + typedef struct { /** Key that uniquely identifies the memory mapped file. In practice, we * take the numerical value of the file descriptor in the object store. 
*/ @@ -311,7 +314,8 @@ plasma_connection *plasma_connect(const char *store_socket_name, */ int fd = -1; int connected_successfully = 0; - for (int num_attempts = 0; num_attempts < 50; ++num_attempts) { + for (int num_attempts = 0; num_attempts < NUM_CONNECT_ATTEMPTS; + ++num_attempts) { fd = connect_ipc_sock(store_socket_name); if (fd >= 0) { connected_successfully = 1; @@ -330,6 +334,10 @@ plasma_connection *plasma_connect(const char *store_socket_name, result->store_conn = fd; if (manager_addr != NULL) { result->manager_conn = plasma_manager_connect(manager_addr, manager_port); + if (result->manager_conn < 0) { + LOG_ERR("Could not connect to Plasma manager %s:%d", manager_addr, + manager_port); + } } else { result->manager_conn = -1; } @@ -348,18 +356,17 @@ void plasma_disconnect(plasma_connection *conn) { #define h_addr h_addr_list[0] -/* TODO(swang): Return the error to the caller. */ -int plasma_manager_connect(const char *ip_addr, int port) { +int plasma_manager_try_connect(const char *ip_addr, int port) { int fd = socket(PF_INET, SOCK_STREAM, 0); if (fd < 0) { LOG_ERR("could not create socket"); - exit(-1); + return -1; } struct hostent *manager = gethostbyname(ip_addr); /* TODO(pcm): cache this */ if (!manager) { LOG_ERR("plasma manager %s not found", ip_addr); - exit(-1); + return -1; } struct sockaddr_in addr; @@ -370,10 +377,26 @@ int plasma_manager_connect(const char *ip_addr, int port) { int r = connect(fd, (struct sockaddr *) &addr, sizeof(addr)); if (r < 0) { LOG_ERR( - "could not establish connection to manager with id %s:%d (probably ran " + "could not establish connection to manager with id %s:%d (may have run " "out of ports)", &ip_addr[0], port); - exit(-1); + return -1; + } + return fd; +} + +int plasma_manager_connect(const char *ip_addr, int port) { + /* Try to connect to the Plasma manager. If unsuccessful, retry several times. 
+ */ + int fd = -1; + for (int num_attempts = 0; num_attempts < NUM_CONNECT_ATTEMPTS; + ++num_attempts) { + fd = plasma_manager_try_connect(ip_addr, port); + if (fd >= 0) { + break; + } + /* Sleep for 100 milliseconds. */ + usleep(100000); } return fd; } @@ -432,3 +455,7 @@ void plasma_fetch(plasma_connection *conn, "Received unexpected object ID from manager during fetch."); } } + +int get_manager_fd(plasma_connection *conn) { + return conn->manager_conn; +} diff --git a/src/plasma/plasma_client.h b/src/plasma/plasma_client.h index ca64b826fc4d..913c34fb0017 100644 --- a/src/plasma/plasma_client.h +++ b/src/plasma/plasma_client.h @@ -62,11 +62,13 @@ plasma_connection *plasma_connect(const char *store_socket_name, void plasma_disconnect(plasma_connection *conn); /** - * Connect to a possibly remote Plasma Manager. + * Try to connect to a possibly remote Plasma Manager. * * @param addr The IP address of the Plasma Manager to connect to. * @param port The port of the Plasma Manager to connect to. - * @return The file descriptor to use to send messages to the Plasma Manager. + * @return The file descriptor to use to send messages to the + * Plasma Manager. If connection was unsuccessful, this + * value is -1. */ int plasma_manager_connect(const char *addr, int port); @@ -195,4 +197,13 @@ void plasma_fetch(plasma_connection *conn, */ int plasma_subscribe(plasma_connection *conn); +/** + * Get the file descriptor for the socket connection to the plasma manager. + * + * @param conn The plasma connection. + * @return The file descriptor for the manager connection. If there is no + * connection to the manager, this is -1. 
+ */ +int get_manager_fd(plasma_connection *conn); + #endif diff --git a/src/plasma/plasma_manager.c b/src/plasma/plasma_manager.c index cefb297fb870..bcdd61973775 100644 --- a/src/plasma/plasma_manager.c +++ b/src/plasma/plasma_manager.c @@ -34,18 +34,9 @@ #include "state/db.h" #include "state/object_table.h" -#define NUM_RETRIES 5 - -/* Timeouts are in milliseconds. */ -#ifndef RAY_TIMEOUT -#define MANAGER_TIMEOUT 1000 -#else -#define MANAGER_TIMEOUT RAY_TIMEOUT -#endif - typedef struct client_object_connection client_object_connection; -typedef struct { +struct plasma_manager_state { /** Event loop. */ event_loop *loop; /** Connection to the local plasma store for reading or writing data. */ @@ -63,26 +54,10 @@ typedef struct { * object id, value is a list of connections to the clients * who are blocking on a fetch of this object. */ client_object_connection *fetch_connections; -} plasma_manager_state; +}; plasma_manager_state *g_manager_state = NULL; -typedef struct plasma_request_buffer plasma_request_buffer; - -/* Buffer for requests between plasma managers. */ -struct plasma_request_buffer { - int type; - object_id object_id; - uint8_t *data; - int64_t data_size; - uint8_t *metadata; - int64_t metadata_size; - /* Pointer to the next buffer that we will write to this plasma manager. This - * field is only used if we're pushing requests to another plasma manager, - * not if we are receiving data. */ - plasma_request_buffer *next; -}; - /* The context for fetch and wait requests. These are per client, per object. */ struct client_object_connection { /** The ID of the object we are fetching or waiting for. */ @@ -103,6 +78,10 @@ struct client_object_connection { char **manager_vector; /** The number of manager locations in the array manager_vector. */ int manager_count; + /** The next manager we should try to contact. This is set to an index in + * manager_vector in the retry handler, in case the current attempt fails to + * contact a manager. 
*/ + int next_manager; /** Handle for the uthash table in the client connection * context that keeps track of active object connection * contexts. */ @@ -138,7 +117,7 @@ struct client_connection { * string
: as an identifier. */ char *ip_addr_port; /** Handle for the uthash table. */ - UT_hash_handle hh; + UT_hash_handle manager_hh; }; void free_client_object_connection(client_object_connection *object_conn) { @@ -150,8 +129,8 @@ void free_client_object_connection(client_object_connection *object_conn) { } int send_client_reply(client_connection *conn, plasma_reply *reply) { - conn->num_return_objects--; CHECK(conn->num_return_objects >= 0); + --conn->num_return_objects; /* TODO(swang): Handle errors in write. */ int n = write(conn->fd, (uint8_t *) reply, sizeof(plasma_reply)); return (n != sizeof(plasma_reply)); @@ -174,17 +153,6 @@ client_object_connection *get_object_connection(client_connection *client_conn, return object_conn; } -/** - * Create a new context for the given object ID with the given - * client connection and register it with the manager's - * outstanding fetch or wait requests and the client - * connection's active object contexts. - * - * @param client_conn The client connection context. - * @param object_id The object ID whose context we want to - * create. - * @return A pointer to the newly created object context. - */ client_object_connection *add_object_connection(client_connection *client_conn, object_id object_id) { /* TODO(swang): Support registration of wait operations. */ @@ -198,6 +166,7 @@ client_object_connection *add_object_connection(client_connection *client_conn, object_conn->client_conn = client_conn; object_conn->manager_count = 0; object_conn->manager_vector = NULL; + object_conn->next_manager = 0; /* Register the object context with the client context. */ HASH_ADD(active_hh, client_conn->active_objects, object_id, sizeof(object_id), object_conn); @@ -217,13 +186,6 @@ client_object_connection *add_object_connection(client_connection *client_conn, return object_conn; } -/** - * Clean up and free an active object context. Deregister it from the - * associated client connection and from the manager state. 
- * - * @param client_conn The client connection context. - * @param object_id The object ID whose context we want to delete. - */ void remove_object_connection(client_connection *client_conn, client_object_connection *object_conn) { /* Deregister the object context with the client context. */ @@ -283,6 +245,33 @@ plasma_manager_state *init_plasma_manager_state(const char *store_socket_name, return state; } +void destroy_plasma_manager_state(plasma_manager_state *state) { + client_connection *manager_conn, *tmp; + HASH_ITER(manager_hh, state->manager_connections, manager_conn, tmp) { + HASH_DELETE(manager_hh, state->manager_connections, manager_conn); + plasma_request_buffer *head = manager_conn->transfer_queue; + while (head) { + LL_DELETE(manager_conn->transfer_queue, head); + free(head); + head = manager_conn->transfer_queue; + } + close(manager_conn->fd); + free(manager_conn->ip_addr_port); + free(manager_conn); + } + /* There should not be any outstanding client connections if + * we're shutting down. */ + CHECK(state->fetch_connections == NULL); + + free(state->plasma_conn); + event_loop_destroy(state->loop); + free(state); +} + +event_loop *get_event_loop(plasma_manager_state *state) { + return state->loop; +} + /* Handle a command request that came in through a socket (transfering data, * or accepting incoming data). */ void process_message(event_loop *loop, @@ -291,6 +280,7 @@ void process_message(event_loop *loop, int events); void write_object_chunk(client_connection *conn, plasma_request_buffer *buf) { + LOG_DEBUG("Writing data to fd %d", conn->fd); ssize_t r, s; /* Try to write one BUFSIZE at a time. 
*/ s = buf->data_size + buf->metadata_size - conn->cursor; @@ -365,21 +355,17 @@ void send_queued_request(event_loop *loop, } } -void process_data_chunk(event_loop *loop, - int data_sock, - void *context, - int events) { - LOG_DEBUG("Reading data"); +int read_object_chunk(client_connection *conn, plasma_request_buffer *buf) { + LOG_DEBUG("Reading data from fd %d to %p", conn->fd, + buf->data + conn->cursor); ssize_t r, s; - client_connection *conn = (client_connection *) context; - plasma_request_buffer *buf = conn->transfer_queue; CHECK(buf != NULL); /* Try to read one BUFSIZE at a time. */ s = buf->data_size + buf->metadata_size - conn->cursor; if (s > BUFSIZE) { s = BUFSIZE; } - r = read(data_sock, buf->data + conn->cursor, s); + r = read(conn->fd, buf->data + conn->cursor, s); if (r == -1) { LOG_ERR("read error"); @@ -388,10 +374,25 @@ void process_data_chunk(event_loop *loop, } else { conn->cursor += r; } + /* If the cursor is equal to the full object size, reset the cursor and we're + * done. */ + if (conn->cursor == buf->data_size + buf->metadata_size) { + conn->cursor = 0; + return 1; + } else { + return 0; + } +} - if (conn->cursor != buf->data_size + buf->metadata_size) { - /* If we haven't finished reading all the data for this object yet, we're - * done for now. */ +void process_data_chunk(event_loop *loop, + int data_sock, + void *context, + int events) { + /* Read the object chunk. 
*/ + client_connection *conn = (client_connection *) context; + plasma_request_buffer *buf = conn->transfer_queue; + int done = read_object_chunk(conn, buf); + if (!done) { return; } @@ -433,19 +434,24 @@ client_connection *get_manager_connection(plasma_manager_state *state, utstring_new(ip_addr_port); utstring_printf(ip_addr_port, "%s:%d", ip_addr, port); client_connection *manager_conn; - HASH_FIND_STR(state->manager_connections, utstring_body(ip_addr_port), - manager_conn); + HASH_FIND(manager_hh, state->manager_connections, utstring_body(ip_addr_port), + utstring_len(ip_addr_port), manager_conn); LOG_DEBUG("Getting manager connection to %s on DB client %d", utstring_body(ip_addr_port), get_client_id(state->db)); if (!manager_conn) { /* If we don't already have a connection to this manager, start one. */ + int fd = plasma_manager_connect(ip_addr, port); + /* TODO(swang): Handle the case when connection to this manager was + * unsuccessful. */ + CHECK(fd >= 0); manager_conn = malloc(sizeof(client_connection)); - manager_conn->fd = plasma_manager_connect(ip_addr, port); + manager_conn->fd = fd; manager_conn->manager_state = state; manager_conn->transfer_queue = NULL; manager_conn->cursor = 0; manager_conn->ip_addr_port = strdup(utstring_body(ip_addr_port)); - HASH_ADD_KEYPTR(hh, manager_conn->manager_state->manager_connections, + HASH_ADD_KEYPTR(manager_hh, + manager_conn->manager_state->manager_connections, manager_conn->ip_addr_port, strlen(manager_conn->ip_addr_port), manager_conn); } @@ -510,7 +516,7 @@ void process_data_request(event_loop *loop, plasma_create(conn->manager_state->plasma_conn, object_id, data_size, NULL, metadata_size, &(buf->data)); LL_APPEND(conn->transfer_queue, buf); - conn->cursor = 0; + CHECK(conn->cursor == 0); /* Switch to reading the data from this socket, instead of listening for * other requests. 
*/ @@ -534,10 +540,12 @@ void request_transfer_from(client_connection *client_conn, get_object_connection(client_conn, object_id); CHECK(object_conn); CHECK(object_conn->manager_count > 0); + CHECK(object_conn->next_manager >= 0 && + object_conn->next_manager < object_conn->manager_count); char addr[16]; int port; - int i = object_conn->num_retries % object_conn->manager_count; - parse_ip_addr_port(object_conn->manager_vector[i], addr, &port); + parse_ip_addr_port(object_conn->manager_vector[object_conn->next_manager], + addr, &port); client_connection *manager_conn = get_manager_connection(client_conn->manager_state, addr, port); @@ -554,6 +562,9 @@ void request_transfer_from(client_connection *client_conn, } /* Add this transfer request to this connection's transfer queue. */ LL_APPEND(manager_conn->transfer_queue, transfer_request); + /* On the next attempt, try the next manager in manager_vector. */ + ++object_conn->next_manager; + object_conn->next_manager %= object_conn->manager_count; } int manager_timeout_handler(event_loop *loop, timer_id id, void *context) { @@ -571,19 +582,9 @@ int manager_timeout_handler(event_loop *loop, timer_id id, void *context) { return AE_NOMORE; } -/** - * Given an object ID and the managers it can be found on, start requesting a - * transfer from the managers. - * - * @param object_id The object ID we want to request a transfer of. - * @param manager_count The number of managers the object can be found on. - * @param manager_vector A vector of the IP addresses of the managers that the - * object can be found on. - * @param context The context for the connection to this client. - * - * Initializes a new context for this client and object. Managers are tried in - * order until we receive the data or we timeout and run out of retries. - */ +/* TODO(swang): Consolidate transfer requests for same object + * from different client IDs by passing in manager state, not + * client context. 
*/ void request_transfer(object_id object_id, int manager_count, const char *manager_vector[], @@ -640,6 +641,8 @@ void process_fetch_request(client_connection *client_conn, return; } /* Register the new context with the current client connection. */ + /* TODO(swang): If there is already an outstanding fetch request for this + * object, exit now. */ client_object_connection *object_conn = add_object_connection(client_conn, object_id); if (!object_conn) { @@ -656,7 +659,7 @@ void process_fetch_requests(client_connection *client_conn, int num_object_ids, object_id object_ids[]) { for (int i = 0; i < num_object_ids; ++i) { - client_conn->num_return_objects++; + ++client_conn->num_return_objects; process_fetch_request(client_conn, object_ids[i]); } } @@ -707,20 +710,33 @@ void process_message(event_loop *loop, free(req); } -void new_client_connection(event_loop *loop, - int listener_sock, - void *context, - int events) { +client_connection *new_client_connection(event_loop *loop, + int listener_sock, + void *context, + int events) { int new_socket = accept_client(listener_sock); /* Create a new data connection context per client. 
*/ client_connection *conn = malloc(sizeof(client_connection)); conn->manager_state = (plasma_manager_state *) context; + conn->cursor = 0; conn->transfer_queue = NULL; conn->fd = new_socket; conn->active_objects = NULL; conn->num_return_objects = 0; event_loop_add_file(loop, new_socket, EVENT_LOOP_READ, process_message, conn); LOG_DEBUG("New plasma manager connection with fd %d", new_socket); + return conn; +} + +void handle_new_client(event_loop *loop, + int listener_sock, + void *context, + int events) { + (void) new_client_connection(loop, listener_sock, context, events); +} + +int get_client_sock(client_connection *conn) { + return conn->fd; } void start_server(const char *store_socket_name, @@ -728,16 +744,17 @@ void start_server(const char *store_socket_name, int port, const char *db_addr, int db_port) { - int sock = bind_inet_sock(port); - CHECKM(sock >= 0, "Unable to bind to manager port"); - g_manager_state = init_plasma_manager_state(store_socket_name, master_addr, port, db_addr, db_port); CHECK(g_manager_state); + + int sock = bind_inet_sock(port); + CHECKM(sock >= 0, "Unable to bind to manager port"); + LOG_DEBUG("Started server connected to store %s, listening on port %d", store_socket_name, port); event_loop_add_file(g_manager_state->loop, sock, EVENT_LOOP_READ, - new_client_connection, g_manager_state); + handle_new_client, g_manager_state); event_loop_run(g_manager_state->loop); } @@ -751,6 +768,9 @@ void signal_handler(int signal) { } } +/* Only declare the main function if we are not in testing mode, since the test + * suite has its own declaration of main. */ +#ifndef PLASMA_TEST int main(int argc, char *argv[]) { signal(SIGTERM, signal_handler); /* Socket name of the plasma store this manager is connected to. 
*/ @@ -802,3 +822,4 @@ int main(int argc, char *argv[]) { start_server(store_socket_name, master_addr, port, NULL, 0); } } +#endif diff --git a/src/plasma/plasma_manager.h b/src/plasma/plasma_manager.h index 368b1314e57b..368dfb05dd40 100644 --- a/src/plasma/plasma_manager.h +++ b/src/plasma/plasma_manager.h @@ -4,7 +4,55 @@ #include #include "utarray.h" +#ifndef RAY_NUM_RETRIES +#define NUM_RETRIES 5 +#else +#define NUM_RETRIES RAY_NUM_RETRIES +#endif + +/* Timeouts are in milliseconds. */ +#ifndef RAY_TIMEOUT +#define MANAGER_TIMEOUT 1000 +#else +#define MANAGER_TIMEOUT RAY_TIMEOUT +#endif + +/* The buffer size in bytes. Data will get transfered in multiples of this */ +#define BUFSIZE 4096 + +typedef struct plasma_manager_state plasma_manager_state; typedef struct client_connection client_connection; +typedef struct client_object_connection client_object_connection; + +/** + * Initializes the plasma manager state. This connects the manager to the local + * plasma store, starts the manager listening for client connections, and + * connects the manager to a database if there is one. The returned manager + * state should be freed using the provided destroy_plasma_manager_state + * function. + * + * @param store_socket_name The socket name used to connect to the local store. + * @param manager_addr Our IP address. + * @param manager_port The IP port that we listen on. + * @param db_addr The IP address of the database to connect to. If this is + * NULL, then the manager will be initialized without a database + * connection. + * @param db_port The IP port of the database to connect to. + * @return A pointer to the initialized plasma manager state. + */ +plasma_manager_state *init_plasma_manager_state(const char *store_socket_name, + const char *manager_addr, + int manager_port, + const char *db_addr, + int db_port); + +/** + * Destroys the plasma manager state and its connections. + * + * @param state A pointer to the plasma manager state to destroy. + * @return Void. 
+ */ +void destroy_plasma_manager_state(plasma_manager_state *state); /** * Process a request from another object store manager to transfer an object. @@ -124,12 +172,125 @@ void send_queued_request(event_loop *loop, * @param context The plasma manager state. * @return Void. */ -void new_client_connection(event_loop *loop, - int listener_sock, - void *context, - int events); +client_connection *new_client_connection(event_loop *loop, + int listener_sock, + void *context, + int events); -/* The buffer size in bytes. Data will get transfered in multiples of this */ -#define BUFSIZE 4096 +/** + * The following definitions are internal to the plasma manager code but are + * needed by the unit tests in test/manager_tests.c. This includes structs + * instantiated by the unit tests and forward declarations for functions used + * internally by the plasma manager code. + */ + +/* Buffer for requests between plasma managers. */ +typedef struct plasma_request_buffer plasma_request_buffer; +struct plasma_request_buffer { + int type; + object_id object_id; + uint8_t *data; + int64_t data_size; + uint8_t *metadata; + int64_t metadata_size; + /* Pointer to the next buffer that we will write to this plasma manager. This + * field is only used if we're pushing requests to another plasma manager, + * not if we are receiving data. */ + plasma_request_buffer *next; +}; + +/** + * Create a new context for the given object ID with the given + * client connection and register it with the manager's + * outstanding fetch or wait requests and the client + * connection's active object contexts. + * + * @param client_conn The client connection context. + * @param object_id The object ID whose context we want to + * create. + * @return A pointer to the newly created object context. 
+ */ +client_object_connection *add_object_connection(client_connection *client_conn, + object_id object_id); + +/** + * Given an object ID and the managers it can be found on, start requesting a + * transfer from the managers. + * + * @param object_id The object ID we want to request a transfer of. + * @param manager_count The number of managers the object can be found on. + * @param manager_vector A vector of the IP addresses of the managers that the + * object can be found on. + * @param context The context for the connection to this client. + * + * Initializes a new context for this client and object. Managers are tried in + * order until we receive the data or we timeout and run out of retries. + */ +void request_transfer(object_id object_id, + int manager_count, + const char *manager_vector[], + void *context); + +/** + * Clean up and free an active object context. Deregister it from the + * associated client connection and from the manager state. + * + * @param client_conn The client connection context. + * @param object_id The object ID whose context we want to delete. + */ +void remove_object_connection(client_connection *client_conn, + client_object_connection *object_conn); + +/** + * Get a connection to the remote manager at the specified address. Creates a + * new connection to this manager if one doesn't already exist. + * + * @param state Our plasma manager state. + * @param ip_addr The IP address of the remote manager we want to connect to. + * @param port The port that the remote manager is listening on. + * @return A pointer to the connection to the remote manager. + */ +client_connection *get_manager_connection(plasma_manager_state *state, + const char *ip_addr, + int port); + +/** + * Reads an object chunk sent by the given client into a buffer. This is the + * complement to write_object_chunk. + * + * @param conn The connection to the client who's sending the data. + * @param buf The buffer to write the data into. 
+ * @return An integer representing whether the client is done + * sending this object. 1 means that the client has + * sent all the data, 0 means there is more. + */ +int read_object_chunk(client_connection *conn, plasma_request_buffer *buf); + +/** + * Writes an object chunk from a buffer to the given client. This is the + * complement to read_object_chunk. + * + * @param conn The connection to the client who's receiving the data. + * @param buf The buffer to read data from. + * @return Void. + */ +void write_object_chunk(client_connection *conn, plasma_request_buffer *buf); + +/** + * Get the event loop of the given plasma manager state. + * + * @param state The state of the plasma manager whose loop we want. + * @return A pointer to the manager's event loop. + */ +event_loop *get_event_loop(plasma_manager_state *state); + +/** + * Get the file descriptor for the given client's socket. This is the socket + * that the client sends or reads data through. + * + * @param conn The connection to the client who's sending or reading data. + * @return A file descriptor for the socket. 
+ */ +int get_client_sock(client_connection *conn); #endif /* PLASMA_MANAGER_H */ diff --git a/src/plasma/plasma_store.c b/src/plasma/plasma_store.c index cfcbbcfaa052..c6702c3cf0a6 100644 --- a/src/plasma/plasma_store.c +++ b/src/plasma/plasma_store.c @@ -495,10 +495,10 @@ void signal_handler(int signal) { } void start_server(char *socket_name) { - int socket = bind_ipc_sock(socket_name); - CHECK(socket >= 0); event_loop *loop = event_loop_create(); plasma_store_state *state = init_plasma_store(loop); + int socket = bind_ipc_sock(socket_name); + CHECK(socket >= 0); event_loop_add_file(loop, socket, EVENT_LOOP_READ, new_client_connection, state); event_loop_run(loop); diff --git a/src/plasma/test/manager_tests.c b/src/plasma/test/manager_tests.c new file mode 100644 index 000000000000..a1d51d05f5e1 --- /dev/null +++ b/src/plasma/test/manager_tests.c @@ -0,0 +1,265 @@ +#include "greatest.h" + +#include +#include +#include +#include + +#include "common.h" +#include "event_loop.h" +#include "io.h" + +#include "plasma.h" +#include "plasma_client.h" +#include "plasma_manager.h" + +SUITE(plasma_manager_tests); + +const char *manager_addr = "127.0.0.1"; +int manager_port = 12345; +const char *store_socket_name = "/tmp/store12345"; +object_id oid; + +int test_done_handler(event_loop *loop, timer_id id, void *context) { + event_loop_stop(loop); + return AE_NOMORE; +} + +typedef struct { + int port; + int manager_fd; + int local_store; + plasma_manager_state *state; + event_loop *loop; + /* Accept a connection from the local manager on the remote manager. */ + client_connection *write_conn; + client_connection *read_conn; + /* Connect a new client to the local plasma manager and mock a request to an + * object. 
*/ + plasma_connection *plasma_conn; + client_connection *client_conn; + client_object_connection *object_conn; +} plasma_mock; + +plasma_mock *init_plasma_mock(int port, plasma_mock *remote_mock) { + plasma_mock *mock = malloc(sizeof(plasma_mock)); + /* Start listening on all the ports and initiate the local plasma manager. */ + mock->port = port; + mock->manager_fd = bind_inet_sock(port); + mock->local_store = bind_ipc_sock(store_socket_name); + mock->state = + init_plasma_manager_state(store_socket_name, manager_addr, port, NULL, 0); + mock->loop = get_event_loop(mock->state); + /* Accept a connection from the local manager on the remote manager. */ + if (remote_mock != NULL) { + mock->write_conn = + get_manager_connection(remote_mock->state, manager_addr, port); + mock->read_conn = + new_client_connection(mock->loop, mock->manager_fd, mock->state, 0); + } else { + mock->write_conn = NULL; + mock->read_conn = NULL; + } + + mock->plasma_conn = NULL; + mock->client_conn = NULL; + mock->object_conn = NULL; + return mock; +} + +void add_mock_object_conn(plasma_mock *mock, object_id oid) { + /* Connect a new client to the local plasma manager and mock a request to an + * object. */ + mock->plasma_conn = + plasma_connect(store_socket_name, manager_addr, mock->port); + mock->client_conn = + new_client_connection(mock->loop, mock->manager_fd, mock->state, 0); + mock->object_conn = add_object_connection(mock->client_conn, oid); +} + +void destroy_plasma_mock(plasma_mock *mock) { + if (mock->object_conn != NULL) { + free(mock->client_conn); + free(mock->plasma_conn); + } + if (mock->read_conn != NULL) { + close(get_client_sock(mock->read_conn)); + free(mock->read_conn); + } + destroy_plasma_manager_state(mock->state); + close(mock->local_store); + close(mock->manager_fd); + free(mock); +} + +/** + * This test checks correct behavior of request_transfer in a non-failure + * scenario. 
Specifically, when one plasma manager calls request_transfer, the + * correct remote manager should receive the correct message. The test: + * - Buffer a transfer request for the remote manager. + * - Start and stop the event loop to make sure that we send the buffered + * request. + * - Expect to see a PLASMA_TRANSFER message on the remote manager with the + * correct object ID. + */ +TEST request_transfer_test(void) { + plasma_mock *local_mock = init_plasma_mock(manager_port, NULL); + add_mock_object_conn(local_mock, oid); + plasma_mock *remote_mock = init_plasma_mock(12346, local_mock); + const char **manager_vector = malloc(sizeof(char *)); + manager_vector[0] = "127.0.0.1:12346"; + request_transfer(oid, 1, manager_vector, local_mock->client_conn); + event_loop_add_timer(local_mock->loop, MANAGER_TIMEOUT, test_done_handler, + local_mock->state); + event_loop_run(local_mock->loop); + int64_t type; + int64_t length; + plasma_request *req; + int read_fd = get_client_sock(remote_mock->read_conn); + read_message(read_fd, &type, &length, (uint8_t **) &req); + ASSERT(type == PLASMA_TRANSFER); + ASSERT(req->num_object_ids == 1); + ASSERT(memcmp(&oid, &req->object_ids[0], sizeof(object_id)) == 0); + /* Clean up. */ + free(req); + destroy_plasma_mock(remote_mock); + remove_object_connection(local_mock->client_conn, local_mock->object_conn); + destroy_plasma_mock(local_mock); + PASS(); +} + +/** + * This test checks correct behavior of request_transfer in a scenario when the + * first manager we try times out. Specifically, when one plasma manager calls + * request_transfer on a list of remote managers and the first manager isn't + * reachable, the second remote manager should receive the correct message + * after the timeout. The test: + * - Buffer a transfer request for the remote managers. + * - Start and stop the event loop after a timeout to make sure that we + * trigger the timeout on the first manager. 
+ * - Expect to see a PLASMA_TRANSFER message on the second remote manager
+ *   with the correct object ID.
+ */
+TEST request_transfer_retry_test(void) {
+  /* Header and payload of the message read back from the second remote. */
+  int64_t type;
+  int64_t length;
+  plasma_request *req;
+  /* A local manager with a client attached for oid, plus two remote peers. */
+  plasma_mock *local = init_plasma_mock(manager_port, NULL);
+  add_mock_object_conn(local, oid);
+  plasma_mock *first_remote = init_plasma_mock(12346, local);
+  plasma_mock *second_remote = init_plasma_mock(12347, local);
+  /* NOTE(review): this list is never freed in the test; presumably
+   * request_transfer takes ownership of it -- confirm. */
+  const char **managers = malloc(2 * sizeof(*managers));
+  managers[0] = "127.0.0.1:12346";
+  managers[1] = "127.0.0.1:12347";
+  request_transfer(oid, 2, managers, local->client_conn);
+  /* Schedule test_done_handler after two manager timeouts, then run the
+   * loop. */
+  event_loop_add_timer(local->loop, MANAGER_TIMEOUT * 2, test_done_handler,
+                       local->state);
+  event_loop_run(local->loop);
+
+  /* Only the second remote's read connection is inspected. */
+  read_message(get_client_sock(second_remote->read_conn), &type, &length,
+               (uint8_t **) &req);
+  ASSERT(type == PLASMA_TRANSFER);
+  ASSERT(req->num_object_ids == 1);
+  ASSERT(memcmp(&oid, &req->object_ids[0], sizeof(object_id)) == 0);
+  /* Tear down: the message, both remotes (second first), then the local. */
+  free(req);
+  destroy_plasma_mock(second_remote);
+  destroy_plasma_mock(first_remote);
+  remove_object_connection(local->client_conn, local->object_conn);
+  destroy_plasma_mock(local);
+  PASS();
+}
+
+/**
+ * This test checks correct behavior of request_transfer in a failure scenario.
+ * Specifically, when one plasma manager calls request_transfer, and the remote
+ * manager that holds the object is unreachable, the client should receive the
+ * failure message after all the retries have timed out.
+ * - Buffer a transfer request for the remote manager.
+ * - Start and stop the event loop after NUM_RETRIES timeouts to make sure that
+ *   we trigger all the retries.
+ * - Expect to see a response on the plasma client saying that the object
+ *   wasn't fetched.
+ */
+TEST request_transfer_timeout_test(void) {
+  /* Reply the plasma client is expected to receive once retries run out. */
+  plasma_reply reply;
+  /* A local manager with a client attached for oid, plus one remote peer. */
+  plasma_mock *local = init_plasma_mock(manager_port, NULL);
+  add_mock_object_conn(local, oid);
+  plasma_mock *remote = init_plasma_mock(12346, local);
+  /* NOTE(review): this list is never freed in the test; presumably
+   * request_transfer takes ownership of it -- confirm. */
+  const char **managers = malloc(sizeof(*managers));
+  managers[0] = "127.0.0.1:12346";
+  request_transfer(oid, 1, managers, local->client_conn);
+  /* Keep the loop running for NUM_RETRIES + 2 manager timeouts before
+   * test_done_handler is invoked. */
+  event_loop_add_timer(local->loop, MANAGER_TIMEOUT * (NUM_RETRIES + 2),
+                       test_done_handler, local->state);
+  event_loop_run(local->loop);
+
+  /* Read the reply from the client's end of its manager connection. */
+  int client_fd = get_manager_fd(local->plasma_conn);
+  int nbytes = recv(client_fd, (uint8_t *) &reply, sizeof(reply), MSG_WAITALL);
+  ASSERT_EQ(nbytes, sizeof(reply));
+  ASSERT_EQ(memcmp(&oid, &reply.object_id, sizeof(object_id)), 0);
+  ASSERT_EQ(reply.has_object, 0);
+  /* Tear down. Unlike the other transfer tests, remove_object_connection is
+   * not called here. */
+  destroy_plasma_mock(remote);
+  destroy_plasma_mock(local);
+  PASS();
+}
+
+/**
+ * This test checks that an object chunk written on one manager connection is
+ * read back unchanged on the other end.
+ * - Write a one-chunk object on remote_mock->write_conn.
+ * - Read the chunk on remote_mock->read_conn.
+ * - Expect the received bytes to equal the bytes that were sent.
+ */
+TEST read_write_object_chunk_test(void) {
+  plasma_mock *local_mock = init_plasma_mock(manager_port, NULL);
+  plasma_mock *remote_mock = init_plasma_mock(12346, local_mock);
+  /* Payload to transfer, including its terminating NUL. No metadata. */
+  const char *payload = "Hello world!";
+  const int data_size = strlen(payload) + 1;
+  const int metadata_size = 0;
+  /* Buffer that is written to the connection. */
+  plasma_request_buffer remote_buf = {
+      .type = PLASMA_DATA,
+      .object_id = oid,
+      .data_size = data_size,
+      .metadata_size = metadata_size,
+      .data = (uint8_t *) payload,
+      .metadata = (uint8_t *) payload + data_size,
+  };
+  /* Buffer that receives the chunk; its storage is freed below. */
+  plasma_request_buffer local_buf = {
+      .object_id = oid,
+      .data = malloc(data_size),
+      .data_size = data_size,
+      .metadata_size = metadata_size,
+  };
+  /* write_conn was obtained from local_mock's state in init_plasma_mock, so
+   * presumably the chunk travels from the local manager to the remote one,
+   * where it is read on read_conn. */
+  write_object_chunk(remote_mock->write_conn, &remote_buf);
+  int done = read_object_chunk(remote_mock->read_conn, &local_buf);
+  ASSERT(done);
+  ASSERT_EQ(memcmp(remote_buf.data, local_buf.data, data_size), 0);
+  /* Tear down: receive storage, the remote mock, then the local mock. */
+  free(local_buf.data);
+  destroy_plasma_mock(remote_mock);
+  destroy_plasma_mock(local_mock);
+  PASS();
+}
+
+/* Fill the shared object ID with 0x01 bytes, then run every manager test. */
+SUITE(plasma_manager_tests) {
+  memset(&oid, 1, sizeof(object_id));
+  RUN_TEST(request_transfer_test);
+  RUN_TEST(request_transfer_retry_test);
+  RUN_TEST(request_transfer_timeout_test);
+  RUN_TEST(read_write_object_chunk_test);
+}
+
+GREATEST_MAIN_DEFS();
+
+/* Entry point: hand the command line to greatest and run the suite. */
+int main(int argc, char **argv) {
+  GREATEST_MAIN_BEGIN();
+  RUN_SUITE(plasma_manager_tests);
+  GREATEST_MAIN_END();
+}
diff --git a/src/plasma/test/test.py b/src/plasma/test/test.py index 04cc65c57559..1b7792578493 100644 --- a/src/plasma/test/test.py +++ b/src/plasma/test/test.py @@ -252,16 +252,13 @@ def setUp(self): if USE_VALGRIND: self.p4 = subprocess.Popen(["valgrind", "--track-origins=yes", "--leak-check=full", "--show-leak-kinds=all", "--error-exitcode=1"] + plasma_manager_command1) self.p5 = subprocess.Popen(["valgrind", "--track-origins=yes", "--leak-check=full", "--show-leak-kinds=all", "--error-exitcode=1"] + plasma_manager_command2) - time.sleep(2.0) else: self.p4 = subprocess.Popen(plasma_manager_command1) self.p5 =
subprocess.Popen(plasma_manager_command2) - time.sleep(0.1) # Connect two PlasmaClients. self.client1 = plasma.PlasmaClient(store_name1, "127.0.0.1", self.port1) self.client2 = plasma.PlasmaClient(store_name2, "127.0.0.1", self.port2) - time.sleep(0.5) def tearDown(self): # Kill the PlasmaStore and PlasmaManager processes.