From: Paul H. Hargrove (PHHargrove_at_lbl_dot_gov)
Date: Wed Dec 01 2004 - 16:34:33 PST
Zoltan,

You may wish to try the attached patch to cr_checkpoint.c, which implements the --fd flag I proposed in an earlier message. Then you should be able to checkpoint to a socket with something like:

    snprintf(cmd_buffer, cmd_buffer_len, "cr_checkpoint --fd %d --pid %d", socket_fd, target_pid);
    system(cmd_buffer);

If you do try this, please let us know how it turns out.

-Paul

Paul H. Hargrove wrote:
> Zoltan,
>
> I think something like the following might work:
>
> len = snprintf(cmd_buffer, cmd_buffer_len, "cr_checkpoint --clobber
> --file /proc/self/fd/%d --pid %d", socket_fd, target_pid);
> if (len >= cmd_buffer_len) {
> /* cmd_buffer too small, deal with it */
> exit(1);
> }
> rc = system(cmd_buffer);
>
> The use of /proc/self/fd/%d will cause the checkpoint to go to an
> existing open file descriptor. The --clobber is needed to ensure it
> goes straight there rather than going to a temporary file that is then
> renamed to the destination (which would fail). Note that this will fail
> if the socket descriptor is close-on-exec.
>
> I think it would be easy to implement "--fd <N>" to get the same
> behavior as "--clobber --file /proc/self/fd/<N>". In fact, I've added a
> request for this feature to our bug database
> (http://mantis.lbl.gov/bugzilla) as bug #882.
>
> -Paul
>
> JCDuell_at_lbl_dot_gov wrote:
>
>> Zoltan:
>>
>> We have not tested checkpointing to a socket, but it should work. We
>> are planning to move most of the logic currently in cr_checkpoint into a
>> set of library routines, which will allow a socket file descriptor to be
>> passed in instead of a regular file. In the meantime, a relatively
>> small amount of hacking in cr_checkpoint.c could allow you to do the
>> same thing--just add a '--socket <address>' flag, and have it open up a
>> TCP socket instead of a file.
>>
>> Cheers,
>>
>
>
-- Paul H.
Hargrove PHHargrove_at_lbl_dot_gov Future Technologies Group HPC Research Department Tel: +1-510-495-2352 Lawrence Berkeley National Laboratory Fax: +1-510-486-6900 Index: util/cr_checkpoint/cr_checkpoint.c =================================================================== RCS file: /var/local/cvs/lbnl_cr/util/cr_checkpoint/cr_checkpoint.c,v retrieving revision 1.44 retrieving revision 1.46 diff -c -r1.44 -r1.46 *** util/cr_checkpoint/cr_checkpoint.c 12 Oct 2004 20:35:59 -0000 1.44 --- util/cr_checkpoint/cr_checkpoint.c 2 Dec 2004 00:26:50 -0000 1.46 *************** *** 21,27 **** * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * ! * $Id: cr_checkpoint.c,v 1.44 2004/10/12 20:35:59 phargrov Exp $ */ --- 21,27 ---- * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * ! * $Id: cr_checkpoint.c,v 1.46 2004/12/02 00:26:50 phargrov Exp $ */ *************** *** 64,85 **** "General options:\n" " -v, --verbose print progress messages to stderr.\n" " -?, --help print this message and exit.\n" "Options for scope of the checkpoint:\n" " -p, --pid, --process ID identifies a process id (default).\n" " -g, --pgid, --group ID identifies a process group id.\n" " -s, --sid, --session ID identifies a session id.\n" "Options for destination location of the checkpoint:\n" " -c, --cwd checkpoint saved as a single 'context.ID' file in\n" " cr_checkpoint's working directory (default).\n" " -d, --dir DIR checkpoint saved in new directory DIR, with one\n" ! " 'context.ID' file per process.\n" " -f, --file FILE checkpoint saved as FILE.\n" ! 
"Options for creation/replacement policy for checkpoint files\n" " --atomic checkpoint created/replaced atomically (default).\n" " --backup[=NAME] checkpoint created atomically, and any existing \n" " checkpoint backed up to NAME or *.~1~, *.~2~, etc.\n" " --clobber checkpoint written incrementally to target, \n" " overwriting any pre-existing checkpoint.\n" "Options for signal sent to process(es) after checkpoint:\n" " --run no signal sent: continue execution (default).\n" " -S, --signal NUM signal NUM sent to all processes/threads.\n" --- 64,91 ---- "General options:\n" " -v, --verbose print progress messages to stderr.\n" " -?, --help print this message and exit.\n" + "\n" "Options for scope of the checkpoint:\n" " -p, --pid, --process ID identifies a process id (default).\n" " -g, --pgid, --group ID identifies a process group id.\n" " -s, --sid, --session ID identifies a session id.\n" + "\n" "Options for destination location of the checkpoint:\n" " -c, --cwd checkpoint saved as a single 'context.ID' file in\n" " cr_checkpoint's working directory (default).\n" " -d, --dir DIR checkpoint saved in new directory DIR, with one\n" ! " 'context.ID' file per process (unimplemented).\n" " -f, --file FILE checkpoint saved as FILE.\n" ! " -F, --fd FD checkpoint written to an open file descriptor.\n" ! "\n" ! 
"Options for creation/replacement policy for checkpoint files:\n" " --atomic checkpoint created/replaced atomically (default).\n" " --backup[=NAME] checkpoint created atomically, and any existing \n" " checkpoint backed up to NAME or *.~1~, *.~2~, etc.\n" " --clobber checkpoint written incrementally to target, \n" " overwriting any pre-existing checkpoint.\n" + "These options are ignored if the destination is a file descriptor.\n" + "\n" "Options for signal sent to process(es) after checkpoint:\n" " --run no signal sent: continue execution (default).\n" " -S, --signal NUM signal NUM sent to all processes/threads.\n" *************** *** 87,95 **** --- 93,103 ---- " --term SIGTERM sent to all processes/threads.\n" /* " --kill SIGKILL sent to all processes/threads.\n" */ " --abort SIGABRT sent to all processes/threads.\n" + "\n" "Options for file system synchronization (default is --sync):\n" " --sync fsync checkpoint file(s) to disk (default).\n" " --nosync do not fsync checkpoint file(s) to disk.\n" + "\n" "Misc Options:\n" " -t, --time SEC allow only SEC seconds for target to complete\n" " checkpoint (default: wait indefinitely).\n" *************** *** 239,248 **** --- 247,265 ---- opt_version }; + /* Type of destination */ + enum { + dest_default, + dest_file, + dest_dir, + dest_fd, + dest_cwd + }; int main(int argc, char **argv) { struct cr_chkpt_req req; + int dest_type = dest_default; int chkpt_fd = -1; int do_atomic = 1; int do_backup = 0; *************** *** 261,267 **** int signal = 0; /* Parse cmdline options */ ! char * shortflags = "f:d:S:pgsct:vh?"; /* 1 colon == requires argument */ struct option longflags[] = { /* target_type: */ { "pid", no_argument , 0, 'p' }, --- 278,284 ---- int signal = 0; /* Parse cmdline options */ ! 
char * shortflags = "f:d:F:S:pgsct:vh?"; /* 1 colon == requires argument */ struct option longflags[] = { /* target_type: */ { "pid", no_argument , 0, 'p' }, *************** *** 273,278 **** --- 290,296 ---- /* destination: */ { "file", required_argument, 0, 'f' }, { "dir", required_argument, 0, 'd' }, + { "fd", required_argument, 0, 'F' }, { "cwd", no_argument, 0, 'c' }, /* creation/replacement policy */ { "atomic", no_argument, 0, opt_atomic }, *************** *** 293,299 **** { "verbose", no_argument, 0, 'v' }, { "help", no_argument, 0, '?' }, { "version", no_argument, 0, opt_version }, ! { 0, 0, 0, 0 } }; while (1) { --- 311,317 ---- { "verbose", no_argument, 0, 'v' }, { "help", no_argument, 0, '?' }, { "version", no_argument, 0, opt_version }, ! { 0, 0, 0, 0 } }; while (1) { *************** *** 316,323 **** die(EINVAL, "Checkpointing of sessions not yet implemented\n"); break; case 'f': ! if (chkpt_dir != NULL) ! die (EINVAL, "only one of -f and -d may be provided\n"); chkpt_file = strdup(optarg); if (!chkpt_file) die(ENOMEM, "strdup failed on string '%s'\n", optarg); --- 334,342 ---- die(EINVAL, "Checkpointing of sessions not yet implemented\n"); break; case 'f': ! if (dest_type != dest_default) ! die (EINVAL, "conflicting destinations provided\n"); ! dest_type = dest_file; chkpt_file = strdup(optarg); if (!chkpt_file) die(ENOMEM, "strdup failed on string '%s'\n", optarg); *************** *** 326,339 **** #if 1 die(EINVAL, "-d flag not yet implemented\n"); #else ! if (chkpt_file != NULL) ! die (EINVAL, "only one of -f and -d may be provided\n"); chkpt_dir = strdup(optarg); if (!chkpt_dir) die(ENOMEM, "strdup failed on string '%s'\n", optarg); break; #endif case 'c': /* nothing to do */ break; case opt_atomic: --- 345,368 ---- #if 1 die(EINVAL, "-d flag not yet implemented\n"); #else ! if (dest_type != dest_default) ! die (EINVAL, "conflicting destinations specified\n"); ! 
dest_type = dest_dir; chkpt_dir = strdup(optarg); if (!chkpt_dir) die(ENOMEM, "strdup failed on string '%s'\n", optarg); break; #endif + case 'F': + if (dest_type != dest_default) + die (EINVAL, "conflicting destinations specified\n"); + dest_type = dest_fd; + chkpt_fd = readint(optarg, argv[0]); + break; case 'c': + if (dest_type != dest_default) + die (EINVAL, "conflicting destinations specified\n"); + dest_type = dest_cwd; /* nothing to do */ break; case opt_atomic: *************** *** 414,453 **** usage(stderr); } ! /* default file name = context.ID */ ! if (!chkpt_dir && !chkpt_file) { #define NAMELEN 64 char buf[NAMELEN]; ! snprintf(buf, NAMELEN, "context.%d", target); ! chkpt_file = strdup(buf); ! if (!chkpt_file) die(errno, "Could not duplicate string '%s'\n", buf); ! } ! if (!(chkpt_to = chkpt_dir)) ! chkpt_to = chkpt_file; ! /* get parent directory name */ ! parent_dir = strdup(chkpt_to); ! if (!parent_dir) ! die(errno, "Error in strdup: %s", strerror(errno)); ! parent_dir = dirname(parent_dir); ! if (do_atomic) { ! char *base = strdup(chkpt_to); ! if (!base) ! die(errno, "Error in strdup: %s\n", strerror(errno)); ! /* save final target name */ ! rename_to = chkpt_to; ! /* use '.context.pid.tmp'-style name for checkpoint file */ ! chkpt_to = (char *)malloc(strlen(chkpt_to) + 10); ! if (!chkpt_to) ! die(errno, "Malloc failed!\n"); ! strcpy(chkpt_to, parent_dir); ! strcat(chkpt_to, "/."); ! strcat(chkpt_to, basename(base)); ! strcat(chkpt_to, ".tmp"); ! free(base); } if (verbose) --- 443,500 ---- usage(stderr); } ! if (dest_type == dest_fd) { #define NAMELEN 64 char buf[NAMELEN]; ! snprintf(buf, NAMELEN, "<fd%d>", chkpt_fd); ! chkpt_to = strdup(buf); ! if (!chkpt_to) die(errno, "Could not duplicate string '%s'\n", buf); ! } else { ! char buf[NAMELEN]; ! switch (dest_type) { ! case dest_dir: ! chkpt_to = chkpt_dir; ! break; ! case dest_default: /* default -> cwd */ ! case dest_cwd: ! snprintf(buf, NAMELEN, "context.%d", target); ! chkpt_file = strdup(buf); ! 
if (!chkpt_file) ! die(errno, "Could not duplicate string '%s'\n", buf); ! /* fall through */ ! case dest_file: ! chkpt_to = chkpt_file; ! break; ! default: ! die(1, "Invalid dest_type\n"); ! } ! ! /* get parent directory name */ ! parent_dir = strdup(chkpt_to); ! if (!parent_dir) ! die(errno, "Error in strdup: %s", strerror(errno)); ! parent_dir = dirname(parent_dir); ! ! if (do_atomic) { ! char *base = strdup(chkpt_to); ! if (!base) ! die(errno, "Error in strdup: %s\n", strerror(errno)); ! /* save final target name */ ! rename_to = chkpt_to; ! /* use '.context.pid.tmp'-style name for checkpoint file */ ! chkpt_to = (char *)malloc(strlen(chkpt_to) + 10); ! if (!chkpt_to) ! die(errno, "Malloc failed!\n"); ! strcpy(chkpt_to, parent_dir); ! strcat(chkpt_to, "/."); ! strcat(chkpt_to, basename(base)); ! strcat(chkpt_to, ".tmp"); ! free(base); ! } } if (verbose) *************** *** 455,469 **** chkpt_to, parent_dir, rename_to); /* TODO: make sure no other checkpoint is occurring to the same file? */ ! if (chkpt_file) { if ((chkpt_fd = openfile(chkpt_to)) == -1) die(errno, "Unable to open checkpoint file '%s'\n", chkpt_file); } else { die(EINVAL, "directories not yet supported\n"); } - /* after this point, remove checkpoint file if there's an error */ - to_remove = chkpt_to; /* BUILD THE REQUEST */ req.cr_target = target; --- 502,519 ---- chkpt_to, parent_dir, rename_to); /* TODO: make sure no other checkpoint is occurring to the same file? */ ! if (chkpt_fd >= 0) { ! /* silently ignore the atomic/backup flags */ ! do_atomic = do_backup = 0; ! 
} else if (chkpt_file) { if ((chkpt_fd = openfile(chkpt_to)) == -1) die(errno, "Unable to open checkpoint file '%s'\n", chkpt_file); + /* after this point, remove checkpoint file if there's an error */ + to_remove = chkpt_to; } else { die(EINVAL, "directories not yet supported\n"); } /* BUILD THE REQUEST */ req.cr_target = target; *************** *** 555,569 **** if (err < 0) die(errno, "Error syncing checkpoint to disk: %s\n", strerror(errno)); ! /* sync parent directory, too, to ensure checkpoint shows up */ ! if (!(dir = opendir(parent_dir))) ! die(errno, "unable to opendir(%s): %s\n", ! parent_dir, strerror(errno)); ! /* ignore fsync errors that might be from using NFS */ ! if (fsync(dirfd(dir)) && errno != EROFS && errno != EINVAL) ! fprintf(stderr, "Warning: unable to sync directory '%s': errno=%d\n", ! parent_dir, errno); ! /* TODO: if checkpoint to directory, fsync all context.pid files */ } return 0; --- 605,621 ---- if (err < 0) die(errno, "Error syncing checkpoint to disk: %s\n", strerror(errno)); ! if (parent_dir != NULL) { ! /* sync parent directory, too, to ensure checkpoint shows up */ ! if (!(dir = opendir(parent_dir))) ! die(errno, "unable to opendir(%s): %s\n", ! parent_dir, strerror(errno)); ! /* ignore fsync errors that might be from using NFS */ ! if (fsync(dirfd(dir)) && errno != EROFS && errno != EINVAL) ! fprintf(stderr, "Warning: unable to sync directory '%s': errno=%d\n", ! parent_dir, errno); ! /* TODO: if checkpoint to directory, fsync all context.pid files */ ! } } return 0;