llprocess.cpp

/** 
 * @file llprocess.cpp
 * @brief Utility class for launching, terminating, and tracking the state of processes.
 *
 * $LicenseInfo:firstyear=2008&license=viewerlgpl$
 * Second Life Viewer Source Code
 * Copyright (C) 2010, Linden Research, Inc.
 * 
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation;
 * version 2.1 of the License only.
 * 
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 * 
 * Linden Research, Inc., 945 Battery Street, San Francisco, CA  94111  USA
 * $/LicenseInfo$
 */

#include "linden_common.h"
#include "llprocess.h"
#include "llsdserialize.h"
#include "llsingleton.h"
#include "llstring.h"
#include "stringize.h"
#include "llapr.h"
#include "apr_signal.h"
#include "llevents.h"

#include <boost/foreach.hpp>
#include <boost/bind.hpp>
#include <iostream>
#include <stdexcept>

static std::string empty;
static LLProcess::Status interpret_status(int status);

/// Need an exception to avoid constructing an invalid LLProcess object, but
/// internal use only
struct LLProcessError: public std::runtime_error
{
	LLProcessError(const std::string& msg): std::runtime_error(msg) {}
};

/**
 * Ref-counted "mainloop" listener. As long as there are still outstanding
 * LLProcess objects, keep listening on "mainloop" so we can keep polling APR
 * for process status.
 */
class LLProcessListener
{
	LOG_CLASS(LLProcessListener);
public:
	LLProcessListener():
		mCount(0)
	{}

	void addPoll(const LLProcess&)
	{
		// Unconditionally increment mCount. If it was zero before
		// incrementing, listen on "mainloop".
		if (mCount++ == 0)
		{
			LL_DEBUGS("LLProcess") << "listening on \"mainloop\"" << LL_ENDL;
			mConnection = LLEventPumps::instance().obtain("mainloop")
				.listen("LLProcessListener", boost::bind(&LLProcessListener::tick, this, _1));
		}
	}

	void dropPoll(const LLProcess&)
	{
		// Unconditionally decrement mCount. If it's zero after decrementing,
		// stop listening on "mainloop".
		if (--mCount == 0)
		{
			LL_DEBUGS("LLProcess") << "disconnecting from \"mainloop\"" << LL_ENDL;
			mConnection.disconnect();
		}
	}

private:
	/// called once per frame by the "mainloop" LLEventPump
	bool tick(const LLSD&)
	{
		// Tell APR to sense whether each registered LLProcess is still
		// running and call handle_status() appropriately. We should be able
		// to get the same info from an apr_proc_wait(APR_NOWAIT) call; but at
		// least in APR 1.4.2, testing suggests that even with APR_NOWAIT,
		// apr_proc_wait() blocks the caller. We can't have that in the
		// viewer. Hence the callback rigmarole. (Once we update APR, it's
		// probably worth testing again.) Also -- although there's an
		// apr_proc_other_child_refresh() call, i.e. get that information for
		// one specific child, it accepts an 'apr_other_child_rec_t*' that's
		// mentioned NOWHERE else in the documentation or header files! I
		// would use the specific call in LLProcess::getStatus() if I knew
		// how. As it is, each call to apr_proc_other_child_refresh_all() will
		// call callbacks for ALL still-running child processes. That's why we
		// centralize such calls, using "mainloop" to ensure it happens once
		// per frame, and refcounting running LLProcess objects to remain
		// registered only while needed.
		LL_DEBUGS("LLProcess") << "calling apr_proc_other_child_refresh_all()" << LL_ENDL;
		apr_proc_other_child_refresh_all(APR_OC_REASON_RUNNING);
		return false;
	}

	/// If this object is destroyed before mCount goes to zero, stop
	/// listening on "mainloop" anyway.
	LLTempBoundListener mConnection;
	unsigned mCount;
};
static LLProcessListener sProcessListener;

LLProcessPtr LLProcess::create(const LLSDOrParams& params)
{
	try
	{
		return LLProcessPtr(new LLProcess(params));
	}
	catch (const LLProcessError& e)
	{
		LL_WARNS("LLProcess") << e.what() << LL_ENDL;
		return LLProcessPtr();
	}
}

/// Call an apr function returning apr_status_t. On failure, log warning and
/// throw LLProcessError mentioning the function call that produced that
/// result.
#define chkapr(func)                            \
    if (ll_apr_warn_status(func))               \
        throw LLProcessError(#func " failed")

LLProcess::LLProcess(const LLSDOrParams& params):
	mAutokill(params.autokill)
{
	if (! params.validateBlock(true))
	{
		throw LLProcessError(STRINGIZE("not launched: failed parameter validation\n"
									   << LLSDNotationStreamer(params)));
	}

	apr_procattr_t *procattr = NULL;
	chkapr(apr_procattr_create(&procattr, gAPRPoolp));

	// For which of stdin, stdout, stderr should we create a pipe to the
	// child? In the viewer, there are only a couple viable
	// apr_procattr_io_set() alternatives: inherit the viewer's own stdxxx
	// handle (APR_NO_PIPE, e.g. for stdout, stderr), or create a pipe that's
	// blocking on the child end but nonblocking at the viewer end
	// (APR_CHILD_BLOCK). The viewer can't block for anything: the parent end
	// MUST be nonblocking. As the APR documentation itself points out, it
	// makes very little sense to set nonblocking I/O for the child end of a
	// pipe: only a specially-written child could deal with that.
	// Other major options could include explicitly creating a single APR pipe
	// and passing it as both stdout and stderr (apr_procattr_child_out_set(),
	// apr_procattr_child_err_set()), or accepting a filename, opening it and
	// passing that apr_file_t (simple <, >, 2> redirect emulation).
//	chkapr(apr_procattr_io_set(procattr, APR_CHILD_BLOCK, APR_CHILD_BLOCK, APR_CHILD_BLOCK));
	chkapr(apr_procattr_io_set(procattr, APR_NO_PIPE, APR_NO_PIPE, APR_NO_PIPE));

	// Thumbs down on implicitly invoking the shell to invoke the child. From
	// our point of view, the other major alternative to APR_PROGRAM_PATH
	// would be APR_PROGRAM_ENV: still copy environment, but require full
	// executable pathname. I don't see a downside to searching the PATH,
	// though: if our caller wants (e.g.) a specific Python interpreter, s/he
	// can still pass the full pathname.
	chkapr(apr_procattr_cmdtype_set(procattr, APR_PROGRAM_PATH));
	// YES, do extra work if necessary to report child exec() failures back to
	// parent process.
	chkapr(apr_procattr_error_check_set(procattr, 1));
	// Do not start a non-autokill child in detached state. On Posix
	// platforms, this setting attempts to daemonize the new child, closing
	// std handles and the like, and that's a bit more detachment than we
	// want. autokill=false just means not to implicitly kill the child when
	// the parent terminates!
//	chkapr(apr_procattr_detach_set(procattr, params.autokill? 0 : 1));

	if (params.autokill)
	{
#if defined(APR_HAS_PROCATTR_AUTOKILL_SET)
		apr_status_t ok = apr_procattr_autokill_set(procattr, 1);
# if LL_WINDOWS
		// As of 2012-02-02, we only expect this to be implemented on Windows.
		// Avoid spamming the log with warnings we fully expect.
		ll_apr_warn_status(ok);
#else   // ! LL_WINDOWS
		(void)ok;                   // suppress 'unused' warning
# endif // ! LL_WINDOWS
#else
		LL_WARNS("LLProcess") << "This version of APR lacks Linden apr_procattr_autokill_set() extension" << LL_ENDL;
#endif
	}

	// Have to instantiate named std::strings for string params items so their
	// c_str() values persist.
	std::string cwd(params.cwd);
	if (! cwd.empty())
	{
		chkapr(apr_procattr_dir_set(procattr, cwd.c_str()));
	}

	// create an argv vector for the child process
	std::vector<const char*> argv;

	// add the executable path
	std::string executable(params.executable);
	argv.push_back(executable.c_str());

	// and any arguments
	std::vector<std::string> args(params.args.begin(), params.args.end());
	BOOST_FOREACH(const std::string& arg, args)
	{
		argv.push_back(arg.c_str());
	}

	// terminate with a null pointer
	argv.push_back(NULL);

	// Launch! The NULL would be the environment block, if we were passing one.
	chkapr(apr_proc_create(&mProcess, argv[0], &argv[0], NULL, procattr, gAPRPoolp));    

	// arrange to call status_callback()
	apr_proc_other_child_register(&mProcess, &LLProcess::status_callback, this, mProcess.in,
								  gAPRPoolp);
	// and make sure we poll it once per "mainloop" tick
	sProcessListener.addPoll(*this);
	mStatus.mState = RUNNING;

	mDesc = STRINGIZE(LLStringUtil::quote(params.executable) << " (" << mProcess.pid << ')');
	LL_INFOS("LLProcess") << "Launched " << params << " (" << mProcess.pid << ")" << LL_ENDL;

	// Unless caller explicitly turned off autokill (child should persist),
	// take steps to terminate the child. This is all suspenders-and-belt: in
	// theory our destructor should kill an autokill child, but in practice
	// that doesn't always work (e.g. VWR-21538).
	if (params.autokill)
	{
		// Tie the lifespan of this child process to the lifespan of our APR
		// pool: on destruction of the pool, forcibly kill the process. Tell
		// APR to try SIGTERM and wait 3 seconds. If that didn't work, use
		// SIGKILL.
		apr_pool_note_subprocess(gAPRPoolp, &mProcess, APR_KILL_AFTER_TIMEOUT);

		// On Windows, associate the new child process with our Job Object.
		autokill();
	}
}

LLProcess::~LLProcess()
{
	// Only in state RUNNING are we registered for callback. In UNSTARTED we
	// haven't yet registered. And since receiving the callback is the only
	// way we detect child termination, we only change from state RUNNING at
	// the same time we unregister.
	if (mStatus.mState == RUNNING)
	{
		// We're still registered for a callback: unregister. Do it before
		// we even issue the kill(): even if kill() somehow prompted an
		// instantaneous callback (unlikely), this object is going away! Any
		// information updated in this object by such a callback is no longer
		// available to any consumer anyway.
		apr_proc_other_child_unregister(this);
		// One less LLProcess to poll for
		sProcessListener.dropPoll(*this);
	}

	if (mAutokill)
	{
		kill("destructor");
	}
}

bool LLProcess::kill(const std::string& who)
{
	if (isRunning())
	{
		LL_INFOS("LLProcess") << who << " killing " << mDesc << LL_ENDL;

#if LL_WINDOWS
		int sig = -1;
#else  // Posix
		int sig = SIGTERM;
#endif

		ll_apr_warn_status(apr_proc_kill(&mProcess, sig));
	}

	return ! isRunning();
}

bool LLProcess::isRunning(void)
{
	return getStatus().mState == RUNNING;
}

LLProcess::Status LLProcess::getStatus()
{
	return mStatus;
}

std::string LLProcess::getStatusString()
{
	return getStatusString(getStatus());
}

std::string LLProcess::getStatusString(const Status& status)
{
	return getStatusString(mDesc, status);
}

//static
std::string LLProcess::getStatusString(const std::string& desc, const Status& status)
{
	if (status.mState == UNSTARTED)
		return desc + " was never launched";

	if (status.mState == RUNNING)
		return desc + " running";

	if (status.mState == EXITED)
		return STRINGIZE(desc << " exited with code " << status.mData);

	if (status.mState == KILLED)
#if LL_WINDOWS
		return STRINGIZE(desc << " killed with exception " << std::hex << status.mData);
#else
		return STRINGIZE(desc << " killed by signal " << status.mData
						 << " (" << apr_signal_description_get(status.mData) << ")");
#endif

	return STRINGIZE(desc << " in unknown state " << status.mState << " (" << status.mData << ")");
}

// Classic-C-style APR callback
void LLProcess::status_callback(int reason, void* data, int status)
{
	// Our only role is to bounce this static method call back into object
	// space.
	static_cast<LLProcess*>(data)->handle_status(reason, status);
}

#define tabent(symbol) { symbol, #symbol }
static struct ReasonCode
{
	int code;
	const char* name;
} reasons[] =
{
	tabent(APR_OC_REASON_DEATH),
	tabent(APR_OC_REASON_UNWRITABLE),
	tabent(APR_OC_REASON_RESTART),
	tabent(APR_OC_REASON_UNREGISTER),
	tabent(APR_OC_REASON_LOST),
	tabent(APR_OC_REASON_RUNNING)
};
#undef tabent

// Object-oriented callback
void LLProcess::handle_status(int reason, int status)
{
	{
		// This odd appearance of LL_DEBUGS is just to bracket a lookup that will
		// only be performed if in fact we're going to produce the log message.
		LL_DEBUGS("LLProcess") << empty;
		std::string reason_str;
		BOOST_FOREACH(const ReasonCode& rcp, reasons)
		{
			if (reason == rcp.code)
			{
				reason_str = rcp.name;
				break;
			}
		}
		if (reason_str.empty())
		{
			reason_str = STRINGIZE("unknown reason " << reason);
		}
		LL_CONT << mDesc << ": handle_status(" << reason_str << ", " << status << ")" << LL_ENDL;
	}

	if (! (reason == APR_OC_REASON_DEATH || reason == APR_OC_REASON_LOST))
	{
		// We're only interested in the call when the child terminates.
		return;
	}

	// Somewhat oddly, APR requires that you explicitly unregister even when
	// it already knows the child has terminated. We must pass the same 'data'
	// pointer as for the register() call, which was our 'this'.
	apr_proc_other_child_unregister(this);
	// don't keep polling for a terminated process
	sProcessListener.dropPoll(*this);
	// We overload mStatus.mState to indicate whether the child is registered
	// for APR callback: only RUNNING means registered. Track that we've
	// unregistered. We know the child has terminated; might be EXITED or
	// KILLED; refine below.
	mStatus.mState = EXITED;

//	wi->rv = apr_proc_wait(wi->child, &wi->rc, &wi->why, APR_NOWAIT);
	// It's just wrong to call apr_proc_wait() here. The only way APR knows to
	// call us with APR_OC_REASON_DEATH is that it's already reaped this child
	// process, so calling wait() will only produce "huh?" from the OS. We
	// must rely on the status param passed in, which unfortunately comes
	// straight from the OS wait() call, which means we have to decode it by
	// hand.
	mStatus = interpret_status(status);
	LL_INFOS("LLProcess") << getStatusString() << LL_ENDL;
}

LLProcess::id LLProcess::getProcessID() const
{
	return mProcess.pid;
}

LLProcess::handle LLProcess::getProcessHandle() const
{
#if LL_WINDOWS
	return mProcess.hproc;
#else
	return mProcess.pid;
#endif
}

std::ostream& operator<<(std::ostream& out, const LLProcess::Params& params)
{
	std::string cwd(params.cwd);
	if (! cwd.empty())
	{
		out << "cd " << LLStringUtil::quote(cwd) << ": ";
	}
	out << LLStringUtil::quote(params.executable);
	BOOST_FOREACH(const std::string& arg, params.args)
	{
		out << ' ' << LLStringUtil::quote(arg);
	}
	return out;
}

/*****************************************************************************
*   Windows specific
*****************************************************************************/
#if LL_WINDOWS

static std::string WindowsErrorString(const std::string& operation);

void LLProcess::autokill()
{
	// hopefully now handled by apr_procattr_autokill_set()
}

LLProcess::handle LLProcess::isRunning(handle h, const std::string& desc)
{
	// This direct Windows implementation is because we have no access to the
	// apr_proc_t struct: we expect it's been destroyed.
	if (! h)
		return 0;

	DWORD waitresult = WaitForSingleObject(h, 0);
	if(waitresult == WAIT_OBJECT_0)
	{
		// the process has completed.
		if (! desc.empty())
		{
			DWORD status = 0;
			if (! GetExitCodeProcess(h, &status))
			{
				LL_WARNS("LLProcess") << desc << " terminated, but "
									  << WindowsErrorString("GetExitCodeProcess()") << LL_ENDL;
			}
			{
				LL_INFOS("LLProcess") << getStatusString(desc, interpret_status(status))
									  << LL_ENDL;
			}
		}
		CloseHandle(h);
		return 0;
	}

	return h;
}

static LLProcess::Status interpret_status(int status)
{
	LLProcess::Status result;

	// This bit of code is cribbed from apr/threadproc/win32/proc.c, a
	// function (unfortunately static) called why_from_exit_code():
	/* See WinNT.h STATUS_ACCESS_VIOLATION and family for how
	 * this class of failures was determined
	 */
	if ((status & 0xFFFF0000) == 0xC0000000)
	{
		result.mState = LLProcess::KILLED;
	}
	else
	{
		result.mState = LLProcess::EXITED;
	}
	result.mData = status;

	return result;
}

/// GetLastError()/FormatMessage() boilerplate
static std::string WindowsErrorString(const std::string& operation)
{
	int result = GetLastError();

	LPTSTR error_str = 0;
	if (FormatMessage( FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM,
					   NULL,
					   result,
					   0,
					   (LPTSTR)&error_str,
					   0,
					   NULL)
		!= 0) 
	{
        // convert from wide-char string to multi-byte string
		char message[256];
		wcstombs(message, error_str, sizeof(message));
		message[sizeof(message)-1] = 0;
		LocalFree(error_str);
		// convert to std::string to trim trailing whitespace
		std::string mbsstr(message);
		mbsstr.erase(mbsstr.find_last_not_of(" \t\r\n"));
		return STRINGIZE(operation << " failed (" << result << "): " << mbsstr);
	}
	return STRINGIZE(operation << " failed (" << result
					 << "), but FormatMessage() did not explain");
}

/*****************************************************************************
*   Posix specific
*****************************************************************************/
#else // Mac and linux

#include <signal.h>
#include <fcntl.h>
#include <errno.h>
#include <sys/wait.h>

void LLProcess::autokill()
{
	// What we ought to do here is to:
	// 1. create a unique process group and run all autokill children in that
	//    group (see https://jira.secondlife.com/browse/SWAT-563);
	// 2. figure out a way to intercept control when the viewer exits --
	//    gracefully or not; 
	// 3. when the viewer exits, kill off the aforementioned process group.

	// It's point 2 that's troublesome. Although I've seen some signal-
	// handling logic in the Posix viewer code, I haven't yet found any bit of
	// code that's run no matter how the viewer exits (a try/finally for the
	// whole process, as it were).
}

// Attempt to reap a process ID -- returns true if the process has exited and been reaped, false otherwise.
static bool reap_pid(pid_t pid, LLProcess::Status* pstatus=NULL)
{
	LLProcess::Status dummy;
	if (! pstatus)
	{
		// If caller doesn't want to see Status, give us a target anyway so we
		// don't have to have a bunch of conditionals.
		pstatus = &dummy;
	}

	int status = 0;
	pid_t wait_result = ::waitpid(pid, &status, WNOHANG);
	if (wait_result == pid)
	{
		*pstatus = interpret_status(status);
		return true;
	}
	if (wait_result == 0)
	{
		pstatus->mState = LLProcess::RUNNING;
		pstatus->mData	= 0;
		return false;
	}

	// Clear caller's Status block; caller must interpret UNSTARTED to mean
	// "if this PID was ever valid, it no longer is."
	*pstatus = LLProcess::Status();

	// We've dealt with the success cases: we were able to reap the child
	// (wait_result == pid) or it's still running (wait_result == 0). It may
	// be that the child terminated but didn't hang around long enough for us
	// to reap. In that case we still have no Status to report, but we can at
	// least state that it's not running.
	if (wait_result == -1 && errno == ECHILD)
	{
		// No such process -- this may mean we're ignoring SIGCHILD.
		return true;
	}

	// Uh, should never happen?!
	LL_WARNS("LLProcess") << "LLProcess::reap_pid(): waitpid(" << pid << ") returned "
						  << wait_result << "; not meaningful?" << LL_ENDL;
	// If caller is looping until this pid terminates, and if we can't find
	// out, better to break the loop than to claim it's still running.
	return true;
}

LLProcess::id LLProcess::isRunning(id pid, const std::string& desc)
{
	// This direct Posix implementation is because we have no access to the
	// apr_proc_t struct: we expect it's been destroyed.
	if (! pid)
		return 0;

	// Check whether the process has exited, and reap it if it has.
	LLProcess::Status status;
	if(reap_pid(pid, &status))
	{
		// the process has exited.
		if (! desc.empty())
		{
			std::string statstr(desc + " apparently terminated: no status available");
			// We don't just pass UNSTARTED to getStatusString() because, in
			// the context of reap_pid(), that state has special meaning.
			if (status.mState != UNSTARTED)
			{
				statstr = getStatusString(desc, status);
			}
			LL_INFOS("LLProcess") << statstr << LL_ENDL;
		}
		return 0;
	}

	return pid;
}

static LLProcess::Status interpret_status(int status)
{
	LLProcess::Status result;

	if (WIFEXITED(status))
	{
		result.mState = LLProcess::EXITED;
		result.mData  = WEXITSTATUS(status);
	}
	else if (WIFSIGNALED(status))
	{
		result.mState = LLProcess::KILLED;
		result.mData  = WTERMSIG(status);
	}
	else                            // uh, shouldn't happen?
	{
		result.mState = LLProcess::EXITED;
		result.mData  = status;     // someone else will have to decode
	}

	return result;
}

/*==========================================================================*|
static std::list<pid_t> sZombies;

void LLProcess::orphan(void)
{
	// Disassociate the process from this object
	if(mProcessID != 0)
	{	
		// We may still need to reap the process's zombie eventually
		sZombies.push_back(mProcessID);
	
		mProcessID = 0;
	}
}

// static 
void LLProcess::reap(void)
{
	// Attempt to real all saved process ID's.
	
	std::list<pid_t>::iterator iter = sZombies.begin();
	while(iter != sZombies.end())
	{
		if(reap_pid(*iter))
		{
			iter = sZombies.erase(iter);
		}
		else
		{
			iter++;
		}
	}
}
|*==========================================================================*/

#endif  // Posix