diff --git a/indra/llcommon/CMakeLists.txt b/indra/llcommon/CMakeLists.txt
index bf99a4c3a04127934ef39d65f96ab159d7e21c23..0c76fd46c0fcc1abc231c97cd2660cc06e50e60e 100755
--- a/indra/llcommon/CMakeLists.txt
+++ b/indra/llcommon/CMakeLists.txt
@@ -103,6 +103,7 @@ set(llcommon_SOURCE_FILES
     llthreadsafequeue.cpp
     lltimer.cpp
     lltrace.cpp
+    lltraceaccumulators.cpp
     lltracerecording.cpp
     lltracethreadrecorder.cpp
     lluri.cpp
@@ -231,6 +232,7 @@ set(llcommon_HEADER_FILES
     llthreadsafequeue.h
     lltimer.h
     lltrace.h
+    lltraceaccumulators.h
     lltracerecording.h
     lltracethreadrecorder.h
     lltreeiterators.h
diff --git a/indra/llcommon/llfasttimer.h b/indra/llcommon/llfasttimer.h
index 642c99ccce1f9702e67b0771d7a1bc6e1f92d9cf..ab8612a8ade8b0e963fa1f7d4c47ca6d59eaf8ca 100755
--- a/indra/llcommon/llfasttimer.h
+++ b/indra/llcommon/llfasttimer.h
@@ -38,13 +38,6 @@ class LLMutex;
 namespace LLTrace
 {
 
-struct BlockTimerStackRecord
-{
-	class BlockTimer*	mActiveTimer;
-	class TimeBlock*	mTimeBlock;
-	U64					mChildTime;
-};
-
 class ThreadTimerStack 
 :	public BlockTimerStackRecord, 
 	public LLThreadLocalSingleton<ThreadTimerStack>
diff --git a/indra/llcommon/llthread.cpp b/indra/llcommon/llthread.cpp
index 118568d5efd39cdf8da469e59fb354393a4af9e0..e8e546e76944faccbcbe4a95a45360c510e00131 100755
--- a/indra/llcommon/llthread.cpp
+++ b/indra/llcommon/llthread.cpp
@@ -93,7 +93,7 @@ void *APR_THREAD_FUNC LLThread::staticRun(apr_thread_t *apr_threadp, void *datap
 {
 	LLThread *threadp = (LLThread *)datap;
 
-	LLTrace::ThreadRecorder* thread_recorder = new LLTrace::SlaveThreadRecorder(LLTrace::getUIThreadRecorder());
+	LLTrace::SlaveThreadRecorder thread_recorder(LLTrace::getUIThreadRecorder());
 
 #if !LL_DARWIN
 	sThreadID = threadp->mID;
@@ -107,8 +107,6 @@ void *APR_THREAD_FUNC LLThread::staticRun(apr_thread_t *apr_threadp, void *datap
 	// We're done with the run function, this thread is done executing now.
 	threadp->mStatus = STOPPED;
 
-	delete thread_recorder;
-
 	return NULL;
 }
 
diff --git a/indra/llcommon/llthreadlocalstorage.cpp b/indra/llcommon/llthreadlocalstorage.cpp
index 32d94331a6c477b76cd12784a9a60c423304fdb0..03c306cc7f53aa91baf16c581ccf33b73c47aece 100644
--- a/indra/llcommon/llthreadlocalstorage.cpp
+++ b/indra/llcommon/llthreadlocalstorage.cpp
@@ -88,6 +88,7 @@ void LLThreadLocalPointerBase::destroyStorage()
 	}
 }
 
+//static
 void LLThreadLocalPointerBase::initAllThreadLocalStorage()
 {
 	if (!sInitialized)
@@ -102,6 +103,7 @@ void LLThreadLocalPointerBase::initAllThreadLocalStorage()
 	}
 }
 
+//static
 void LLThreadLocalPointerBase::destroyAllThreadLocalStorage()
 {
 	if (sInitialized)
diff --git a/indra/llcommon/llthreadlocalstorage.h b/indra/llcommon/llthreadlocalstorage.h
index a15f9185b1e16b1d322429e926ec15a0a2c8b69d..d6399d5131f99714cf40b322d612510e0b2a95b8 100644
--- a/indra/llcommon/llthreadlocalstorage.h
+++ b/indra/llcommon/llthreadlocalstorage.h
@@ -145,7 +145,7 @@ class LLThreadLocalSingleton
 #if LL_DARWIN
         pthread_setspecific(sInstanceKey, NULL);
 #else
-        sInstance = NULL;
+        sData.mInstance = NULL;
 #endif
 		setInitState(DELETED);
 	}
@@ -182,7 +182,7 @@ class LLThreadLocalSingleton
                 llerrs << "Could not set thread local storage" << llendl;
             }
 #else
-			sInstance = instancep;
+			sData.mInstance = instancep;
 #endif
 			setInitState(INITIALIZING);
 			instancep->initSingleton();
@@ -197,7 +197,7 @@ class LLThreadLocalSingleton
 #if LL_DARWIN
         return (DERIVED_TYPE*)pthread_getspecific(sInstanceKey);
 #else
-		return sInstance;
+		return sData.mInstance;
 #endif
 	}
 
@@ -247,7 +247,7 @@ class LLThreadLocalSingleton
         createTLSInitState();
         return (EInitState)(int)pthread_getspecific(sInitStateKey);
 #else
-        return sInitState;
+        return sData.mInitState;
 #endif
     }
     
@@ -257,18 +257,21 @@ class LLThreadLocalSingleton
         createTLSInitState();
         pthread_setspecific(sInitStateKey, (void*)state);
 #else
-        sInitState = state;
+        sData.mInitState = state;
 #endif
     }
 	LLThreadLocalSingleton(const LLThreadLocalSingleton& other);
 	virtual void initSingleton() {}
 
+	struct SingletonData	// bundles the instance pointer and init state so a single thread-local variable (sData) holds both
+	{
+		DERIVED_TYPE*	mInstance;	// singleton instance pointer; NULL in sData's initializer
+		EInitState		mInitState;	// UNINITIALIZED in sData's initializer; member order must match that aggregate initializer
+	};
 #ifdef LL_WINDOWS
-	static __declspec(thread) DERIVED_TYPE* sInstance;
-	static __declspec(thread) EInitState sInitState;
+	static __declspec(thread) SingletonData sData;
 #elif LL_LINUX
-	static __thread DERIVED_TYPE* sInstance;
-	static __thread EInitState sInitState;
+	static __thread SingletonData sData;
 #elif LL_DARWIN
     static pthread_key_t sInstanceKey;
     static pthread_key_t sInitStateKey;
@@ -277,16 +280,10 @@ class LLThreadLocalSingleton
 
 #if LL_WINDOWS
 template<typename DERIVED_TYPE>
-__declspec(thread) DERIVED_TYPE* LLThreadLocalSingleton<DERIVED_TYPE>::sInstance = NULL;
-
-template<typename DERIVED_TYPE>
-__declspec(thread) typename LLThreadLocalSingleton<DERIVED_TYPE>::EInitState LLThreadLocalSingleton<DERIVED_TYPE>::sInitState = LLThreadLocalSingleton<DERIVED_TYPE>::UNINITIALIZED;
+__declspec(thread) typename LLThreadLocalSingleton<DERIVED_TYPE>::SingletonData LLThreadLocalSingleton<DERIVED_TYPE>::sData = {NULL, LLThreadLocalSingleton<DERIVED_TYPE>::UNINITIALIZED};
 #elif LL_LINUX
 template<typename DERIVED_TYPE>
-__thread DERIVED_TYPE* LLThreadLocalSingleton<DERIVED_TYPE>::sInstance = NULL;
-
-template<typename DERIVED_TYPE>
-__thread typename LLThreadLocalSingleton<DERIVED_TYPE>::EInitState LLThreadLocalSingleton<DERIVED_TYPE>::sInitState = LLThreadLocalSingleton<DERIVED_TYPE>::UNINITIALIZED;
+__thread typename LLThreadLocalSingleton<DERIVED_TYPE>::SingletonData LLThreadLocalSingleton<DERIVED_TYPE>::sData = {NULL, LLThreadLocalSingleton<DERIVED_TYPE>::UNINITIALIZED};
 #elif LL_DARWIN
 template<typename DERIVED_TYPE>
 pthread_key_t LLThreadLocalSingleton<DERIVED_TYPE>::sInstanceKey;
diff --git a/indra/llcommon/lltrace.cpp b/indra/llcommon/lltrace.cpp
index 59a4b42c971da09efffaffd7bbca4e2e8b339b45..25807c7b2cd00f7a78d257840dc06307bbbff247 100644
--- a/indra/llcommon/lltrace.cpp
+++ b/indra/llcommon/lltrace.cpp
@@ -35,8 +35,6 @@ static S32 sInitializationCount = 0;
 namespace LLTrace
 {
 
-static MasterThreadRecorder* gUIThreadRecorder = NULL;
-
 void init()
 {
 	if (sInitializationCount++ == 0)
@@ -59,28 +57,6 @@ void cleanup()
 	}
 }
 
-MasterThreadRecorder& getUIThreadRecorder()
-{
-	llassert(gUIThreadRecorder != NULL);
-	return *gUIThreadRecorder;
-}
-
-LLThreadLocalPointer<ThreadRecorder>& get_thread_recorder_ptr()
-{
-	static LLThreadLocalPointer<ThreadRecorder> s_thread_recorder;
-	return s_thread_recorder;
-}
-
-const LLThreadLocalPointer<ThreadRecorder>& get_thread_recorder()
-{
-	return get_thread_recorder_ptr();
-}
-
-void set_thread_recorder(ThreadRecorder* recorder)
-{
-	get_thread_recorder_ptr() = recorder;
-}
-
 
 TimeBlockTreeNode::TimeBlockTreeNode() 
 :	mBlock(NULL),
diff --git a/indra/llcommon/lltrace.h b/indra/llcommon/lltrace.h
index 884a316a3b5408cf4059619273c17e0a4f09971c..72ef51c232250c49bc3dd57872eb1001e2251bbb 100644
--- a/indra/llcommon/lltrace.h
+++ b/indra/llcommon/lltrace.h
@@ -32,7 +32,7 @@
 
 #include "llmemory.h"
 #include "llrefcount.h"
-#include "llunit.h"
+#include "lltraceaccumulators.h"
 #include "llthreadlocalstorage.h"
 #include "lltimer.h"
 
@@ -57,185 +57,6 @@ void init();
 void cleanup();
 bool isInitialized();
 
-const LLThreadLocalPointer<class ThreadRecorder>& get_thread_recorder();
-void set_thread_recorder(class ThreadRecorder*);
-
-class MasterThreadRecorder& getUIThreadRecorder();
-
-template<typename ACCUMULATOR>
-class AccumulatorBuffer : public LLRefCount
-{
-	typedef AccumulatorBuffer<ACCUMULATOR> self_t;
-	static const U32 DEFAULT_ACCUMULATOR_BUFFER_SIZE = 64;
-private:
-	struct StaticAllocationMarker { };
-
-	AccumulatorBuffer(StaticAllocationMarker m)
-	:	mStorageSize(0),
-		mStorage(NULL)
-	{}
-
-public:
-
-	AccumulatorBuffer(const AccumulatorBuffer& other = *getDefaultBuffer())
-	:	mStorageSize(0),
-		mStorage(NULL)
-	{
-		resize(other.mStorageSize);
-		for (S32 i = 0; i < sNextStorageSlot; i++)
-		{
-			mStorage[i] = other.mStorage[i];
-		}
-	}
-
-	~AccumulatorBuffer()
-	{
-		if (isPrimary())
-		{
-			LLThreadLocalSingletonPointer<ACCUMULATOR>::setInstance(NULL);
-		}
-		delete[] mStorage;
-	}
-
-	LL_FORCE_INLINE ACCUMULATOR& operator[](size_t index) 
-	{ 
-		return mStorage[index]; 
-	}
-
-	LL_FORCE_INLINE const ACCUMULATOR& operator[](size_t index) const
-	{ 
-		return mStorage[index]; 
-	}
-
-	void addSamples(const AccumulatorBuffer<ACCUMULATOR>& other, bool append = true)
-	{
-		llassert(mStorageSize >= sNextStorageSlot && other.mStorageSize > sNextStorageSlot);
-		for (size_t i = 0; i < sNextStorageSlot; i++)
-		{
-			mStorage[i].addSamples(other.mStorage[i], append);
-		}
-	}
-
-	void copyFrom(const AccumulatorBuffer<ACCUMULATOR>& other)
-	{
-		llassert(mStorageSize >= sNextStorageSlot && other.mStorageSize > sNextStorageSlot);
-		for (size_t i = 0; i < sNextStorageSlot; i++)
-		{
-			mStorage[i] = other.mStorage[i];
-		}
-	}
-
-	void reset(const AccumulatorBuffer<ACCUMULATOR>* other = NULL)
-	{
-		llassert(mStorageSize >= sNextStorageSlot);
-		for (size_t i = 0; i < sNextStorageSlot; i++)
-		{
-			mStorage[i].reset(other ? &other->mStorage[i] : NULL);
-		}
-	}
-
-	void flush(LLUnitImplicit<F64, LLUnits::Seconds> time_stamp)
-	{
-		llassert(mStorageSize >= sNextStorageSlot);
-		for (size_t i = 0; i < sNextStorageSlot; i++)
-		{
-			mStorage[i].flush(time_stamp);
-		}
-	}
-
-	void makePrimary()
-	{
-		LLThreadLocalSingletonPointer<ACCUMULATOR>::setInstance(mStorage);
-	}
-
-	bool isPrimary() const
-	{
-		return LLThreadLocalSingletonPointer<ACCUMULATOR>::getInstance() == mStorage;
-	}
-
-	LL_FORCE_INLINE static ACCUMULATOR* getPrimaryStorage() 
-	{ 
-		ACCUMULATOR* accumulator = LLThreadLocalSingletonPointer<ACCUMULATOR>::getInstance();
-		return accumulator ? accumulator : getDefaultBuffer()->mStorage;
-	}
-
-	// NOTE: this is not thread-safe.  We assume that slots are reserved in the main thread before any child threads are spawned
-	size_t reserveSlot()
-	{
-#ifndef LL_RELEASE_FOR_DOWNLOAD
-		if (LLTrace::isInitialized())
-		{
-			llerrs << "Attempting to declare trace object after program initialization.  Trace objects should be statically initialized." << llendl;
-		}
-#endif
-		size_t next_slot = sNextStorageSlot++;
-		if (next_slot >= mStorageSize)
-		{
-			resize(mStorageSize + (mStorageSize >> 2));
-		}
-		llassert(mStorage && next_slot < mStorageSize);
-		return next_slot;
-	}
-
-	void resize(size_t new_size)
-	{
-		if (new_size <= mStorageSize) return;
-
-		ACCUMULATOR* old_storage = mStorage;
-		mStorage = new ACCUMULATOR[new_size];
-		if (old_storage)
-		{
-			for (S32 i = 0; i < mStorageSize; i++)
-			{
-				mStorage[i] = old_storage[i];
-			}
-		}
-		mStorageSize = new_size;
-		delete[] old_storage;
-
-		self_t* default_buffer = getDefaultBuffer();
-		if (this != default_buffer
-			&& new_size > default_buffer->size())
-		{
-			//NB: this is not thread safe, but we assume that all resizing occurs during static initialization
-			default_buffer->resize(new_size);
-		}
-	}
-
-	size_t size() const
-	{
-		return getNumIndices();
-	}
-
-	static size_t getNumIndices() 
-	{
-		return sNextStorageSlot;
-	}
-
-	static self_t* getDefaultBuffer()
-	{
-		static bool sInitialized = false;
-		if (!sInitialized)
-		{
-			// this buffer is allowed to leak so that trace calls from global destructors have somewhere to put their data
-			// so as not to trigger an access violation
-			sDefaultBuffer = new AccumulatorBuffer(StaticAllocationMarker());
-			sInitialized = true;
-			sDefaultBuffer->resize(DEFAULT_ACCUMULATOR_BUFFER_SIZE);
-		}
-		return sDefaultBuffer;
-	}
-
-private:
-	ACCUMULATOR*	mStorage;
-	size_t			mStorageSize;
-	static size_t	sNextStorageSlot;
-	static self_t*	sDefaultBuffer;
-};
-
-template<typename ACCUMULATOR> size_t AccumulatorBuffer<ACCUMULATOR>::sNextStorageSlot = 0;
-template<typename ACCUMULATOR> AccumulatorBuffer<ACCUMULATOR>* AccumulatorBuffer<ACCUMULATOR>::sDefaultBuffer = NULL;
-
 template<typename ACCUMULATOR>
 class TraceType 
 :	 public LLInstanceTracker<TraceType<ACCUMULATOR>, std::string>
@@ -267,344 +88,6 @@ class TraceType
 	const size_t		mAccumulatorIndex;
 };
 
-class EventAccumulator
-{
-public:
-	typedef F64 value_t;
-	typedef F64 mean_t;
-
-	EventAccumulator()
-	:	mSum(0),
-		mMin((std::numeric_limits<F64>::max)()),
-		mMax((std::numeric_limits<F64>::min)()),
-		mMean(0),
-		mVarianceSum(0),
-		mNumSamples(0),
-		mLastValue(0)
-	{}
-
-	void record(F64 value)
-	{
-		mNumSamples++;
-		mSum += value;
-		// NOTE: both conditions will hold on first pass through
-		if (value < mMin)
-		{
-			mMin = value;
-		}
-		if (value > mMax)
-		{
-			mMax = value;
-		}
-		F64 old_mean = mMean;
-		mMean += (value - old_mean) / (F64)mNumSamples;
-		mVarianceSum += (value - old_mean) * (value - mMean);
-		mLastValue = value;
-	}
-
-	void addSamples(const EventAccumulator& other, bool append)
-	{
-		if (other.mNumSamples)
-		{
-			mSum += other.mSum;
-
-			// NOTE: both conditions will hold first time through
-			if (other.mMin < mMin) { mMin = other.mMin; }
-			if (other.mMax > mMax) { mMax = other.mMax; }
-
-			// combine variance (and hence standard deviation) of 2 different sized sample groups using
-			// the following formula: http://www.mrc-bsu.cam.ac.uk/cochrane/handbook/chapter_7/7_7_3_8_combining_groups.htm
-			F64 n_1 = (F64)mNumSamples,
-				n_2 = (F64)other.mNumSamples;
-			F64 m_1 = mMean,
-				m_2 = other.mMean;
-			F64 v_1 = mVarianceSum / mNumSamples,
-				v_2 = other.mVarianceSum / other.mNumSamples;
-			if (n_1 == 0)
-			{
-				mVarianceSum = other.mVarianceSum;
-			}
-			else if (n_2 == 0)
-			{
-				// don't touch variance
-				// mVarianceSum = mVarianceSum;
-			}
-			else
-			{
-				mVarianceSum = (F64)mNumSamples
-								* ((((n_1 - 1.f) * v_1)
-									+ ((n_2 - 1.f) * v_2)
-									+ (((n_1 * n_2) / (n_1 + n_2))
-										* ((m_1 * m_1) + (m_2 * m_2) - (2.f * m_1 * m_2))))
-									/ (n_1 + n_2 - 1.f));
-			}
-
-			F64 weight = (F64)mNumSamples / (F64)(mNumSamples + other.mNumSamples);
-			mNumSamples += other.mNumSamples;
-			mMean = mMean * weight + other.mMean * (1.f - weight);
-			if (append) mLastValue = other.mLastValue;
-		}
-	}
-
-	void reset(const EventAccumulator* other)
-	{
-		mNumSamples = 0;
-		mSum = 0;
-		mMin = std::numeric_limits<F64>::max();
-		mMax = std::numeric_limits<F64>::min();
-		mMean = 0;
-		mVarianceSum = 0;
-		mLastValue = other ? other->mLastValue : 0;
-	}
-
-	void flush(LLUnitImplicit<F64, LLUnits::Seconds>) {}
-
-	F64	getSum() const { return mSum; }
-	F64	getMin() const { return mMin; }
-	F64	getMax() const { return mMax; }
-	F64	getLastValue() const { return mLastValue; }
-	F64	getMean() const { return mMean; }
-	F64 getStandardDeviation() const { return sqrtf(mVarianceSum / mNumSamples); }
-	U32 getSampleCount() const { return mNumSamples; }
-
-private:
-	F64	mSum,
-		mMin,
-		mMax,
-		mLastValue;
-
-	F64	mMean,
-		mVarianceSum;
-
-	U32	mNumSamples;
-};
-
-
-class SampleAccumulator
-{
-public:
-	typedef F64 value_t;
-	typedef F64 mean_t;
-
-	SampleAccumulator()
-	:	mSum(0),
-		mMin((std::numeric_limits<F64>::max)()),
-		mMax((std::numeric_limits<F64>::min)()),
-		mMean(0),
-		mVarianceSum(0),
-		mLastSampleTimeStamp(LLTimer::getTotalSeconds()),
-		mTotalSamplingTime(0),
-		mNumSamples(0),
-		mLastValue(0),
-		mHasValue(false)
-	{}
-
-	void sample(F64 value)
-	{
-		LLUnitImplicit<F64, LLUnits::Seconds> time_stamp = LLTimer::getTotalSeconds();
-		LLUnitImplicit<F64, LLUnits::Seconds> delta_time = time_stamp - mLastSampleTimeStamp;
-		mLastSampleTimeStamp = time_stamp;
-
-		if (mHasValue)
-		{
-			mTotalSamplingTime += delta_time;
-			mSum += mLastValue * delta_time;
-
-			// NOTE: both conditions will hold first time through
-			if (value < mMin) { mMin = value; }
-			if (value > mMax) { mMax = value; }
-
-			F64 old_mean = mMean;
-			mMean += (delta_time / mTotalSamplingTime) * (mLastValue - old_mean);
-			mVarianceSum += delta_time * (mLastValue - old_mean) * (mLastValue - mMean);
-		}
-
-		mLastValue = value;
-		mNumSamples++;
-		mHasValue = true;
-	}
-
-	void addSamples(const SampleAccumulator& other, bool append)
-	{
-		if (other.mTotalSamplingTime)
-		{
-			mSum += other.mSum;
-
-			// NOTE: both conditions will hold first time through
-			if (other.mMin < mMin) { mMin = other.mMin; }
-			if (other.mMax > mMax) { mMax = other.mMax; }
-
-			// combine variance (and hence standard deviation) of 2 different sized sample groups using
-			// the following formula: http://www.mrc-bsu.cam.ac.uk/cochrane/handbook/chapter_7/7_7_3_8_combining_groups.htm
-			F64 n_1 = mTotalSamplingTime,
-				n_2 = other.mTotalSamplingTime;
-			F64 m_1 = mMean,
-				m_2 = other.mMean;
-			F64 v_1 = mVarianceSum / mTotalSamplingTime,
-				v_2 = other.mVarianceSum / other.mTotalSamplingTime;
-			if (n_1 == 0)
-			{
-				mVarianceSum = other.mVarianceSum;
-			}
-			else if (n_2 == 0)
-			{
-				// variance is unchanged
-				// mVarianceSum = mVarianceSum;
-			}
-			else
-			{
-				mVarianceSum =	mTotalSamplingTime
-								* ((((n_1 - 1.f) * v_1)
-									+ ((n_2 - 1.f) * v_2)
-									+ (((n_1 * n_2) / (n_1 + n_2))
-										* ((m_1 * m_1) + (m_2 * m_2) - (2.f * m_1 * m_2))))
-									/ (n_1 + n_2 - 1.f));
-			}
-
-			llassert(other.mTotalSamplingTime > 0);
-			F64 weight = mTotalSamplingTime / (mTotalSamplingTime + other.mTotalSamplingTime);
-			mNumSamples += other.mNumSamples;
-			mTotalSamplingTime += other.mTotalSamplingTime;
-			mMean = (mMean * weight) + (other.mMean * (1.0 - weight));
-			if (append)
-			{
-				mLastValue = other.mLastValue;
-				mLastSampleTimeStamp = other.mLastSampleTimeStamp;
-				mHasValue |= other.mHasValue;
-			}
-		}
-	}
-
-	void reset(const SampleAccumulator* other)
-	{
-		mNumSamples = 0;
-		mSum = 0;
-		mMin = std::numeric_limits<F64>::max();
-		mMax = std::numeric_limits<F64>::min();
-		mMean = other ? other->mLastValue : 0;
-		mVarianceSum = 0;
-		mLastSampleTimeStamp = LLTimer::getTotalSeconds();
-		mTotalSamplingTime = 0;
-		mLastValue = other ? other->mLastValue : 0;
-		mHasValue = other ? other->mHasValue : false;
-	}
-
-	void flush(LLUnitImplicit<F64, LLUnits::Seconds> time_stamp)
-	{
-		LLUnitImplicit<F64, LLUnits::Seconds> delta_time = time_stamp - mLastSampleTimeStamp;
-
-		if (mHasValue)
-		{
-			mSum += mLastValue * delta_time;
-			mTotalSamplingTime += delta_time;
-		}
-		mLastSampleTimeStamp = time_stamp;
-	}
-
-	F64	getSum() const { return mSum; }
-	F64	getMin() const { return mMin; }
-	F64	getMax() const { return mMax; }
-	F64	getLastValue() const { return mLastValue; }
-	F64	getMean() const { return mMean; }
-	F64 getStandardDeviation() const { return sqrtf(mVarianceSum / mTotalSamplingTime); }
-	U32 getSampleCount() const { return mNumSamples; }
-
-private:
-	F64	mSum,
-		mMin,
-		mMax,
-		mLastValue;
-
-	bool mHasValue;
-
-	F64	mMean,
-		mVarianceSum;
-
-	LLUnitImplicit<F64, LLUnits::Seconds>	mLastSampleTimeStamp,
-											mTotalSamplingTime;
-
-	U32	mNumSamples;
-};
-
-class CountAccumulator
-{
-public:
-	typedef F64 value_t;
-	typedef F64 mean_t;
-
-	CountAccumulator()
-	:	mSum(0),
-		mNumSamples(0)
-	{}
-
-	void add(F64 value)
-	{
-		mNumSamples++;
-		mSum += value;
-	}
-
-	void addSamples(const CountAccumulator& other, bool /*append*/)
-	{
-		mSum += other.mSum;
-		mNumSamples += other.mNumSamples;
-	}
-
-	void reset(const CountAccumulator* other)
-	{
-		mNumSamples = 0;
-		mSum = 0;
-	}
-
-	void flush(LLUnitImplicit<F64, LLUnits::Seconds>) {}
-
-	F64	getSum() const { return mSum; }
-
-	U32 getSampleCount() const { return mNumSamples; }
-
-private:
-	F64	mSum;
-
-	U32	mNumSamples;
-};
-
-class TimeBlockAccumulator
-{
-public:
-	typedef LLUnit<F64, LLUnits::Seconds> value_t;
-	typedef LLUnit<F64, LLUnits::Seconds> mean_t;
-	typedef TimeBlockAccumulator self_t;
-
-	// fake classes that allows us to view different facets of underlying statistic
-	struct CallCountFacet 
-	{
-		typedef U32 value_t;
-		typedef F32 mean_t;
-	};
-
-	struct SelfTimeFacet
-	{
-		typedef LLUnit<F64, LLUnits::Seconds> value_t;
-		typedef LLUnit<F64, LLUnits::Seconds> mean_t;
-	};
-
-	TimeBlockAccumulator();
-	void addSamples(const self_t& other, bool /*append*/);
-	void reset(const self_t* other);
-	void flush(LLUnitImplicit<F64, LLUnits::Seconds>) {}
-
-	//
-	// members
-	//
-	U64							mStartTotalTimeCounter,
-								mTotalTimeCounter,
-								mSelfTimeCounter;
-	U32							mCalls;
-	class TimeBlock*			mParent;		// last acknowledged parent of this time block
-	class TimeBlock*			mLastCaller;	// used to bootstrap tree construction
-	U16							mActiveCount;	// number of timers with this ID active on stack
-	bool						mMoveUpTree;	// needs to be moved up the tree of timers at the end of frame
-
-};
 
 template<>
 class TraceType<TimeBlockAccumulator::CallCountFacet>
@@ -628,23 +111,6 @@ class TraceType<TimeBlockAccumulator::SelfTimeFacet>
 	{}
 };
 
-class TimeBlock;
-class TimeBlockTreeNode
-{
-public:
-	TimeBlockTreeNode();
-
-	void setParent(TimeBlock* parent);
-	TimeBlock* getParent() { return mParent; }
-
-	TimeBlock*					mBlock;
-	TimeBlock*					mParent;	
-	std::vector<TimeBlock*>		mChildren;
-	bool						mCollapsed;
-	bool						mNeedsSorting;
-};
-
-
 template <typename T = F64>
 class EventStatHandle
 :	public TraceType<EventAccumulator>
@@ -712,64 +178,6 @@ void add(CountStatHandle<T>& count, VALUE_T value)
 	count.getPrimaryAccumulator()->add(storage_value(converted_value));
 }
 
-
-struct MemStatAccumulator
-{
-	typedef MemStatAccumulator self_t;
-
-	// fake classes that allows us to view different facets of underlying statistic
-	struct AllocationCountFacet 
-	{
-		typedef U32 value_t;
-		typedef F32 mean_t;
-	};
-
-	struct DeallocationCountFacet 
-	{
-		typedef U32 value_t;
-		typedef F32 mean_t;
-	};
-
-	struct ChildMemFacet
-	{
-		typedef LLUnit<F64, LLUnits::Bytes> value_t;
-		typedef LLUnit<F64, LLUnits::Bytes> mean_t;
-	};
-
-	MemStatAccumulator()
-	:	mAllocatedCount(0),
-		mDeallocatedCount(0)
-	{}
-
-	void addSamples(const MemStatAccumulator& other, bool append)
-	{
-		mSize.addSamples(other.mSize, append);
-		mChildSize.addSamples(other.mChildSize, append);
-		mAllocatedCount += other.mAllocatedCount;
-		mDeallocatedCount += other.mDeallocatedCount;
-	}
-
-	void reset(const MemStatAccumulator* other)
-	{
-		mSize.reset(other ? &other->mSize : NULL);
-		mChildSize.reset(other ? &other->mChildSize : NULL);
-		mAllocatedCount = 0;
-		mDeallocatedCount = 0;
-	}
-
-	void flush(LLUnitImplicit<F64, LLUnits::Seconds> time_stamp) 
-	{
-		mSize.flush(time_stamp);
-		mChildSize.flush(time_stamp);
-	}
-
-	SampleAccumulator	mSize,
-						mChildSize;
-	int					mAllocatedCount,
-						mDeallocatedCount;
-};
-
-
 template<>
 class TraceType<MemStatAccumulator::AllocationCountFacet>
 :	public TraceType<MemStatAccumulator>
diff --git a/indra/llcommon/lltraceaccumulators.cpp b/indra/llcommon/lltraceaccumulators.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..950c1d97d11addc90e38c45390ef0010ef1cf892
--- /dev/null
+++ b/indra/llcommon/lltraceaccumulators.cpp
@@ -0,0 +1,123 @@
+/** 
+ * @file lltraceaccumulators.cpp
+ *
+ * $LicenseInfo:firstyear=2001&license=viewerlgpl$
+ * Second Life Viewer Source Code
+ * Copyright (C) 2012, Linden Research, Inc.
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License only.
+ * 
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ * 
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * 
+ * Linden Research, Inc., 945 Battery Street, San Francisco, CA  94111  USA
+ * $/LicenseInfo$
+ */
+
+#include "linden_common.h"
+
+#include "lltraceaccumulators.h"
+#include "lltracethreadrecorder.h"
+
+namespace LLTrace
+{
+
+
+///////////////////////////////////////////////////////////////////////
+// AccumulatorBufferGroup
+///////////////////////////////////////////////////////////////////////
+
+AccumulatorBufferGroup::AccumulatorBufferGroup() 
+{} // empty body and no member initializers: every member buffer is default-constructed
+
+void AccumulatorBufferGroup::handOffTo(AccumulatorBufferGroup& other) // resets every buffer in 'other', passing this group's matching buffer to each reset()
+{
+	other.mCounts.reset(&mCounts);
+	other.mSamples.reset(&mSamples);
+	other.mEvents.reset(&mEvents);
+	other.mStackTimers.reset(&mStackTimers);
+	other.mMemStats.reset(&mMemStats);
+}
+
+void AccumulatorBufferGroup::makePrimary() // makes all five buffers primary, then refreshes the timer accumulators' parent pointers
+{
+	mCounts.makePrimary();
+	mSamples.makePrimary();
+	mEvents.makePrimary();
+	mStackTimers.makePrimary();
+	mMemStats.makePrimary();
+
+	// update stacktimer parent pointers from the tree nodes held by the thread recorder
+	ThreadRecorder* recorder = get_thread_recorder().get();	// NOTE(review): dereferenced below without a NULL check -- confirm a recorder is always set here
+	AccumulatorBuffer<TimeBlockAccumulator>& stack_timers = mStackTimers;
+	for (S32 slot = 0, num_slots = mStackTimers.size(); slot < num_slots; ++slot)
+	{
+		TimeBlockTreeNode* node = recorder->getTimeBlockTreeNode(slot);
+		if (!node) continue;	// no tree node for this slot: leave the accumulator's parent untouched
+
+		// mirror the tree node's parent link into the matching accumulator
+		stack_timers[slot].mParent = node->mParent;
+	}
+}
+
+//static -- calls clearPrimary() on the AccumulatorBuffer of each accumulator type a group holds
+void AccumulatorBufferGroup::clearPrimary()
+{
+	AccumulatorBuffer<CountAccumulator>::clearPrimary();	
+	AccumulatorBuffer<SampleAccumulator>::clearPrimary();
+	AccumulatorBuffer<EventAccumulator>::clearPrimary();
+	AccumulatorBuffer<TimeBlockAccumulator>::clearPrimary();
+	AccumulatorBuffer<MemStatAccumulator>::clearPrimary();
+}
+
+bool AccumulatorBufferGroup::isPrimary() const
+{
+	return mCounts.isPrimary(); // only mCounts is consulted; presumably it stands in for the group because makePrimary() switches all five buffers together -- confirm
+}
+
+void AccumulatorBufferGroup::append( const AccumulatorBufferGroup& other ) // adds other's samples into all five buffers, stack timers included (merge() leaves those out)
+{
+	mCounts.addSamples(other.mCounts); // no second argument here or below, so addSamples() uses its default for 'append'
+	mSamples.addSamples(other.mSamples);
+	mEvents.addSamples(other.mEvents);
+	mMemStats.addSamples(other.mMemStats);
+	mStackTimers.addSamples(other.mStackTimers);
+}
+
+void AccumulatorBufferGroup::merge( const AccumulatorBufferGroup& other) // adds other's samples with append = false; unlike append(), stack timers are skipped
+{
+	mCounts.addSamples(other.mCounts, false);
+	mSamples.addSamples(other.mSamples, false);
+	mEvents.addSamples(other.mEvents, false);
+	mMemStats.addSamples(other.mMemStats, false);
+	// for now, hold out timers from merge, need to be displayed per thread
+	//mStackTimers.addSamples(other.mStackTimers, false);
+}
+
+void AccumulatorBufferGroup::reset(AccumulatorBufferGroup* other) // resets all five buffers; 'other' may be NULL
+{
+	mCounts.reset(other ? &other->mCounts : NULL); // each buffer receives other's matching buffer, or NULL when no 'other' is given
+	mSamples.reset(other ? &other->mSamples : NULL);
+	mEvents.reset(other ? &other->mEvents : NULL);
+	mStackTimers.reset(other ? &other->mStackTimers : NULL);
+	mMemStats.reset(other ? &other->mMemStats : NULL);
+}
+
+void AccumulatorBufferGroup::sync() // passes one shared timestamp to sync() on the sample and mem-stat buffers
+{
+	LLUnitImplicit<F64, LLUnits::Seconds> time_stamp = LLTimer::getTotalSeconds(); // read once so both buffers see the same time
+
+	mSamples.sync(time_stamp);
+	mMemStats.sync(time_stamp); // mCounts, mEvents and mStackTimers are not synced here
+}
+
+}
diff --git a/indra/llcommon/lltraceaccumulators.h b/indra/llcommon/lltraceaccumulators.h
new file mode 100644
index 0000000000000000000000000000000000000000..825cc9e3a8ea370a198d4d85aa17d1e00203e4e1
--- /dev/null
+++ b/indra/llcommon/lltraceaccumulators.h
@@ -0,0 +1,661 @@
+/** 
+ * @file lltraceaccumulators.h
+ * @brief Storage for accumulating statistics
+ *
+ * $LicenseInfo:firstyear=2001&license=viewerlgpl$
+ * Second Life Viewer Source Code
+ * Copyright (C) 2012, Linden Research, Inc.
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License only.
+ * 
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ * 
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ * 
+ * Linden Research, Inc., 945 Battery Street, San Francisco, CA  94111  USA
+ * $/LicenseInfo$
+ */
+
+#ifndef LL_LLTRACEACCUMULATORS_H
+#define LL_LLTRACEACCUMULATORS_H
+
+
+#include "stdtypes.h"
+#include "llpreprocessor.h"
+#include "llunit.h"
+#include "lltimer.h"
+#include "llrefcount.h"
+
+namespace LLTrace
+{
+
+	template<typename ACCUMULATOR>
+	class AccumulatorBuffer : public LLRefCount
+	{
+		typedef AccumulatorBuffer<ACCUMULATOR> self_t;
+		static const U32 DEFAULT_ACCUMULATOR_BUFFER_SIZE = 64;
+	private:
+		struct StaticAllocationMarker { };
+		// builds a buffer with no storage; only getDefaultBuffer() uses this overload
+		AccumulatorBuffer(StaticAllocationMarker m)
+		:	mStorage(NULL),
+			mStorageSize(0)
+		{}
+
+	public:
+		// copies the first sNextStorageSlot accumulators of other (the default buffer when no argument is given)
+		AccumulatorBuffer(const AccumulatorBuffer& other = *getDefaultBuffer())
+		:	mStorage(NULL),
+			mStorageSize(0)
+		{
+			resize(other.mStorageSize);
+			for (size_t i = 0; i < sNextStorageSlot; i++)
+			{
+				mStorage[i] = other.mStorage[i];
+			}
+		}
+
+		~AccumulatorBuffer()
+		{
+			if (isPrimary())
+			{
+				LLThreadLocalSingletonPointer<ACCUMULATOR>::setInstance(NULL);
+			}
+			delete[] mStorage;
+		}
+
+		LL_FORCE_INLINE ACCUMULATOR& operator[](size_t index)
+		{
+			return mStorage[index];
+		}
+
+		LL_FORCE_INLINE const ACCUMULATOR& operator[](size_t index) const
+		{
+			return mStorage[index];
+		}
+		// adds other's samples slot by slot; only indices below sNextStorageSlot are touched, so ">=" is the exact bound
+		void addSamples(const AccumulatorBuffer<ACCUMULATOR>& other, bool append = true)
+		{
+			llassert(mStorageSize >= sNextStorageSlot && other.mStorageSize >= sNextStorageSlot);
+			for (size_t i = 0; i < sNextStorageSlot; i++)
+			{
+				mStorage[i].addSamples(other.mStorage[i], append);
+			}
+		}
+
+		void copyFrom(const AccumulatorBuffer<ACCUMULATOR>& other)
+		{
+			llassert(mStorageSize >= sNextStorageSlot && other.mStorageSize >= sNextStorageSlot);
+			for (size_t i = 0; i < sNextStorageSlot; i++)
+			{
+				mStorage[i] = other.mStorage[i];
+			}
+		}
+
+		void reset(const AccumulatorBuffer<ACCUMULATOR>* other = NULL)
+		{
+			llassert(mStorageSize >= sNextStorageSlot);
+			for (size_t i = 0; i < sNextStorageSlot; i++)
+			{
+				mStorage[i].reset(other ? &other->mStorage[i] : NULL);
+			}
+		}
+
+		void sync(LLUnitImplicit<F64, LLUnits::Seconds> time_stamp)
+		{
+			llassert(mStorageSize >= sNextStorageSlot);
+			for (size_t i = 0; i < sNextStorageSlot; i++)
+			{
+				mStorage[i].sync(time_stamp);
+			}
+		}
+
+		void makePrimary()
+		{
+			LLThreadLocalSingletonPointer<ACCUMULATOR>::setInstance(mStorage);
+		}
+
+		bool isPrimary() const
+		{
+			return LLThreadLocalSingletonPointer<ACCUMULATOR>::getInstance() == mStorage;
+		}
+
+		static void clearPrimary()
+		{
+			LLThreadLocalSingletonPointer<ACCUMULATOR>::setInstance(NULL);
+		}
+
+		LL_FORCE_INLINE static ACCUMULATOR* getPrimaryStorage()
+		{
+			ACCUMULATOR* accumulator = LLThreadLocalSingletonPointer<ACCUMULATOR>::getInstance();
+			return accumulator ? accumulator : getDefaultBuffer()->mStorage;	// fall back to the default buffer when no primary is set
+		}
+
+		// NOTE: this is not thread-safe.  We assume that slots are reserved in the main thread before any child threads are spawned
+		size_t reserveSlot()
+		{
+#ifndef LL_RELEASE_FOR_DOWNLOAD
+			if (LLTrace::isInitialized())
+			{
+				llerrs << "Attempting to declare trace object after program initialization.  Trace objects should be statically initialized." << llendl;
+			}
+#endif
+			size_t next_slot = sNextStorageSlot++;
+			if (next_slot >= mStorageSize)
+			{
+				resize(mStorageSize + (mStorageSize >> 2));	// grow by a quarter of the current size
+			}
+			llassert(mStorage && next_slot < mStorageSize);
+			return next_slot;
+		}
+
+		void resize(size_t new_size)
+		{
+			if (new_size <= mStorageSize) return;	// never shrinks
+
+			ACCUMULATOR* old_storage = mStorage;
+			mStorage = new ACCUMULATOR[new_size];
+			if (old_storage)
+			{
+				for (size_t i = 0; i < mStorageSize; i++)
+				{
+					mStorage[i] = old_storage[i];
+				}
+			}
+			mStorageSize = new_size;
+			delete[] old_storage;
+
+			self_t* default_buffer = getDefaultBuffer();
+			if (this != default_buffer
+				&& new_size > default_buffer->size())
+			{
+				//NB: this is not thread safe, but we assume that all resizing occurs during static initialization
+				default_buffer->resize(new_size);
+			}
+		}
+
+		size_t size() const
+		{
+			return getNumIndices();	// number of reserved slots (sNextStorageSlot), not mStorageSize
+		}
+
+		static size_t getNumIndices()
+		{
+			return sNextStorageSlot;
+		}
+
+		static self_t* getDefaultBuffer()
+		{
+			static bool sInitialized = false;
+			if (!sInitialized)
+			{
+				// this buffer is allowed to leak so that trace calls from global destructors have somewhere to put their data
+				// so as not to trigger an access violation
+				sDefaultBuffer = new AccumulatorBuffer(StaticAllocationMarker());
+				sInitialized = true;	// set before resize(), which calls getDefaultBuffer() again
+				sDefaultBuffer->resize(DEFAULT_ACCUMULATOR_BUFFER_SIZE);
+			}
+			return sDefaultBuffer;
+		}
+
+	private:
+		ACCUMULATOR*	mStorage;
+		size_t			mStorageSize;
+		static size_t	sNextStorageSlot;
+		static self_t*	sDefaultBuffer;
+	};
+
+	template<typename ACCUMULATOR> size_t AccumulatorBuffer<ACCUMULATOR>::sNextStorageSlot = 0;
+	template<typename ACCUMULATOR> AccumulatorBuffer<ACCUMULATOR>* AccumulatorBuffer<ACCUMULATOR>::sDefaultBuffer = NULL;
+
+
+	class EventAccumulator
+	{
+	public:
+		typedef F64 value_t;
+		typedef F64 mean_t;
+		// starts empty; mMin/mMax hold +max/-max sentinels so the first recorded value replaces both
+		EventAccumulator()
+		:	mSum(0),
+			mMin((std::numeric_limits<F64>::max)()),
+			mMax(-(std::numeric_limits<F64>::max)()),
+			mLastValue(0),
+			mMean(0),
+			mVarianceSum(0),
+			mNumSamples(0)
+		{}
+
+		void record(F64 value)
+		{
+			mNumSamples++;
+			mSum += value;
+			// NOTE: both conditions will hold on first pass through
+			if (value < mMin)
+			{
+				mMin = value;
+			}
+			if (value > mMax)
+			{
+				mMax = value;
+			}
+			F64 old_mean = mMean;
+			mMean += (value - old_mean) / (F64)mNumSamples;
+			mVarianceSum += (value - old_mean) * (value - mMean);
+			mLastValue = value;
+		}
+
+		void addSamples(const EventAccumulator& other, bool append)
+		{
+			if (other.mNumSamples)
+			{
+				mSum += other.mSum;
+
+				// NOTE: both conditions will hold first time through
+				if (other.mMin < mMin) { mMin = other.mMin; }
+				if (other.mMax > mMax) { mMax = other.mMax; }
+				// the stored sum must cover all n_1 + n_2 samples, because getStandardDeviation() divides by the merged count
+				// combine variance (and hence standard deviation) of 2 different sized sample groups using
+				// the following formula: http://www.mrc-bsu.cam.ac.uk/cochrane/handbook/chapter_7/7_7_3_8_combining_groups.htm
+				F64 n_1 = (F64)mNumSamples,
+					n_2 = (F64)other.mNumSamples;
+				F64 m_1 = mMean,
+					m_2 = other.mMean;
+				F64 v_1 = mVarianceSum / mNumSamples,
+					v_2 = other.mVarianceSum / other.mNumSamples;
+				if (n_1 == 0)
+				{
+					mVarianceSum = other.mVarianceSum;
+				}
+				else if (n_2 == 0)
+				{
+					// don't touch variance
+					// mVarianceSum = mVarianceSum;
+				}
+				else
+				{
+					mVarianceSum = (n_1 + n_2)
+						* ((((n_1 - 1.f) * v_1)
+						+ ((n_2 - 1.f) * v_2)
+						+ (((n_1 * n_2) / (n_1 + n_2))
+						* ((m_1 * m_1) + (m_2 * m_2) - (2.f * m_1 * m_2))))
+						/ (n_1 + n_2 - 1.f));
+				}
+
+				F64 weight = (F64)mNumSamples / (F64)(mNumSamples + other.mNumSamples);
+				mNumSamples += other.mNumSamples;
+				mMean = mMean * weight + other.mMean * (1.f - weight);
+				if (append) mLastValue = other.mLastValue;
+			}
+		}
+
+		void reset(const EventAccumulator* other)
+		{
+			mNumSamples = 0;
+			mSum = 0;
+			mMin = (std::numeric_limits<F64>::max)();
+			mMax = -(std::numeric_limits<F64>::max)();	// numeric_limits::min() is the smallest positive value, not the most negative
+			mMean = 0;
+			mVarianceSum = 0;
+			mLastValue = other ? other->mLastValue : 0;	// keep the last value of other, if given
+		}
+
+		void sync(LLUnitImplicit<F64, LLUnits::Seconds>) {}
+
+		F64	getSum() const { return mSum; }
+		F64	getMin() const { return mMin; }
+		F64	getMax() const { return mMax; }
+		F64	getLastValue() const { return mLastValue; }
+		F64	getMean() const { return mMean; }
+		F64 getStandardDeviation() const { return sqrt(mVarianceSum / mNumSamples); }
+		U32 getSampleCount() const { return mNumSamples; }
+
+	private:
+		F64	mSum,
+			mMin,
+			mMax,
+			mLastValue;
+
+		F64	mMean,
+			mVarianceSum;
+
+		U32	mNumSamples;
+	};
+
+
+	class SampleAccumulator
+	{
+	public:
+		typedef F64 value_t;
+		typedef F64 mean_t;
+		// starts with no value; mMin/mMax hold +max/-max sentinels and the sample clock starts at construction time
+		SampleAccumulator()
+		:	mSum(0),
+			mMin((std::numeric_limits<F64>::max)()),
+			mMax(-(std::numeric_limits<F64>::max)()),
+			mLastValue(0),
+			mHasValue(false),
+			mMean(0),
+			mVarianceSum(0),
+			mLastSampleTimeStamp(LLTimer::getTotalSeconds()),
+			mTotalSamplingTime(0),
+			mNumSamples(0)
+		{}
+
+		void sample(F64 value)
+		{
+			LLUnitImplicit<F64, LLUnits::Seconds> time_stamp = LLTimer::getTotalSeconds();
+			LLUnitImplicit<F64, LLUnits::Seconds> delta_time = time_stamp - mLastSampleTimeStamp;
+			mLastSampleTimeStamp = time_stamp;
+
+			if (mHasValue)
+			{
+				mTotalSamplingTime += delta_time;
+				mSum += mLastValue * delta_time;
+
+				// NOTE: both conditions will hold the first time this branch runs (min/max still hold sentinels)
+				if (value < mMin) { mMin = value; }
+				if (value > mMax) { mMax = value; }
+
+				F64 old_mean = mMean;
+				mMean += (delta_time / mTotalSamplingTime) * (mLastValue - old_mean);
+				mVarianceSum += delta_time * (mLastValue - old_mean) * (mLastValue - mMean);
+			}
+
+			mLastValue = value;
+			mNumSamples++;
+			mHasValue = true;
+		}
+
+		void addSamples(const SampleAccumulator& other, bool append)
+		{
+			if (other.mTotalSamplingTime)
+			{
+				mSum += other.mSum;
+
+				// NOTE: both conditions will hold first time through
+				if (other.mMin < mMin) { mMin = other.mMin; }
+				if (other.mMax > mMax) { mMax = other.mMax; }
+				// the stored sum must cover the merged sampling time, because getStandardDeviation() divides by it
+				// combine variance (and hence standard deviation) of 2 different sized sample groups using
+				// the following formula: http://www.mrc-bsu.cam.ac.uk/cochrane/handbook/chapter_7/7_7_3_8_combining_groups.htm
+				F64 n_1 = mTotalSamplingTime,
+					n_2 = other.mTotalSamplingTime;
+				F64 m_1 = mMean,
+					m_2 = other.mMean;
+				F64 v_1 = mVarianceSum / mTotalSamplingTime,
+					v_2 = other.mVarianceSum / other.mTotalSamplingTime;
+				if (n_1 == 0)
+				{
+					mVarianceSum = other.mVarianceSum;
+				}
+				else if (n_2 == 0)
+				{
+					// variance is unchanged
+					// mVarianceSum = mVarianceSum;
+				}
+				else
+				{
+					mVarianceSum =	(n_1 + n_2)
+						* ((((n_1 - 1.f) * v_1)
+						+ ((n_2 - 1.f) * v_2)
+						+ (((n_1 * n_2) / (n_1 + n_2))
+						* ((m_1 * m_1) + (m_2 * m_2) - (2.f * m_1 * m_2))))
+						/ (n_1 + n_2 - 1.f));
+				}
+
+				llassert(other.mTotalSamplingTime > 0);
+				F64 weight = mTotalSamplingTime / (mTotalSamplingTime + other.mTotalSamplingTime);
+				mNumSamples += other.mNumSamples;
+				mTotalSamplingTime += other.mTotalSamplingTime;
+				mMean = (mMean * weight) + (other.mMean * (1.0 - weight));
+				if (append)
+				{
+					mLastValue = other.mLastValue;
+					mLastSampleTimeStamp = other.mLastSampleTimeStamp;
+					mHasValue |= other.mHasValue;
+				}
+			}
+		}
+
+		void reset(const SampleAccumulator* other)
+		{
+			mNumSamples = 0;
+			mSum = 0;
+			mMin = (std::numeric_limits<F64>::max)();
+			mMax = -(std::numeric_limits<F64>::max)();	// numeric_limits::min() is the smallest positive value, not the most negative
+			mMean = other ? other->mLastValue : 0;
+			mVarianceSum = 0;
+			mLastSampleTimeStamp = LLTimer::getTotalSeconds();
+			mTotalSamplingTime = 0;
+			mLastValue = other ? other->mLastValue : 0;
+			mHasValue = other ? other->mHasValue : false;
+		}
+
+		void sync(LLUnitImplicit<F64, LLUnits::Seconds> time_stamp)
+		{
+			LLUnitImplicit<F64, LLUnits::Seconds> delta_time = time_stamp - mLastSampleTimeStamp;
+
+			if (mHasValue)
+			{
+				mSum += mLastValue * delta_time;	// credit the last value for the time elapsed since it was sampled
+				mTotalSamplingTime += delta_time;
+			}
+			mLastSampleTimeStamp = time_stamp;
+		}
+
+		F64	getSum() const { return mSum; }
+		F64	getMin() const { return mMin; }
+		F64	getMax() const { return mMax; }
+		F64	getLastValue() const { return mLastValue; }
+		F64	getMean() const { return mMean; }
+		F64 getStandardDeviation() const { return sqrt(mVarianceSum / mTotalSamplingTime); }
+		U32 getSampleCount() const { return mNumSamples; }
+
+	private:
+		F64	mSum,
+			mMin,
+			mMax,
+			mLastValue;
+
+		bool mHasValue;
+
+		F64	mMean,
+			mVarianceSum;
+
+		LLUnitImplicit<F64, LLUnits::Seconds>	mLastSampleTimeStamp,
+			mTotalSamplingTime;
+
+		U32	mNumSamples;
+	};
+
+	class CountAccumulator
+	{
+	public:
+		typedef F64 value_t;
+		typedef F64 mean_t;
+
+		CountAccumulator()
+		:	mSum(0),
+			mNumSamples(0)
+		{}
+		// adds value to the running sum and counts one more sample
+		void add(F64 value)
+		{
+			mNumSamples++;
+			mSum += value;
+		}
+		// adds other's sum and sample count to this one; the append flag is unused
+		void addSamples(const CountAccumulator& other, bool /*append*/)
+		{
+			mSum += other.mSum;
+			mNumSamples += other.mNumSamples;
+		}
+		// zeroes the sum and the sample count; other is ignored
+		void reset(const CountAccumulator* other)
+		{
+			mNumSamples = 0;
+			mSum = 0;
+		}
+		// empty body: this accumulator does nothing on sync
+		void sync(LLUnitImplicit<F64, LLUnits::Seconds>) {}
+
+		F64	getSum() const { return mSum; }
+
+		U32 getSampleCount() const { return mNumSamples; }
+
+	private:
+		F64	mSum;
+
+		U32	mNumSamples;
+	};
+
+	class TimeBlockAccumulator
+	{
+	public:
+		typedef LLUnit<F64, LLUnits::Seconds> value_t;
+		typedef LLUnit<F64, LLUnits::Seconds> mean_t;
+		typedef TimeBlockAccumulator self_t;
+
+		// fake classes that allows us to view different facets of underlying statistic
+		struct CallCountFacet 
+		{
+			typedef U32 value_t;
+			typedef F32 mean_t;
+		};
+
+		struct SelfTimeFacet
+		{
+			typedef LLUnit<F64, LLUnits::Seconds> value_t;
+			typedef LLUnit<F64, LLUnits::Seconds> mean_t;
+		};
+		// the constructor, addSamples() and reset() are only declared here; sync() has an empty body
+		TimeBlockAccumulator();
+		void addSamples(const self_t& other, bool /*append*/);
+		void reset(const self_t* other);
+		void sync(LLUnitImplicit<F64, LLUnits::Seconds>) {}
+
+		//
+		// members
+		//
+		U64							mStartTotalTimeCounter,
+			mTotalTimeCounter,
+			mSelfTimeCounter;
+		U32							mCalls;
+		class TimeBlock*			mParent;		// last acknowledged parent of this time block
+		class TimeBlock*			mLastCaller;	// used to bootstrap tree construction
+		U16							mActiveCount;	// number of timers with this ID active on stack
+		bool						mMoveUpTree;	// needs to be moved up the tree of timers at the end of frame
+
+	};
+
+	class TimeBlock;
+	class TimeBlockTreeNode
+	{
+	public:
+		TimeBlockTreeNode();
+		// setParent() is only declared here; getParent() returns the public mParent member
+		void setParent(TimeBlock* parent);
+		TimeBlock* getParent() { return mParent; }
+
+		TimeBlock*					mBlock;
+		TimeBlock*					mParent;	
+		std::vector<TimeBlock*>		mChildren;
+		bool						mCollapsed;
+		bool						mNeedsSorting;
+	};
+	
+	struct BlockTimerStackRecord	// base class of ThreadTimerStack in llfasttimer.h
+	{
+		class BlockTimer*	mActiveTimer;
+		class TimeBlock*	mTimeBlock;
+		U64					mChildTime;
+	};
+
+	struct MemStatAccumulator
+	{
+		typedef MemStatAccumulator self_t;
+
+		// fake classes that allows us to view different facets of underlying statistic
+		struct AllocationCountFacet 
+		{
+			typedef U32 value_t;
+			typedef F32 mean_t;
+		};
+
+		struct DeallocationCountFacet 
+		{
+			typedef U32 value_t;
+			typedef F32 mean_t;
+		};
+
+		struct ChildMemFacet
+		{
+			typedef LLUnit<F64, LLUnits::Bytes> value_t;
+			typedef LLUnit<F64, LLUnits::Bytes> mean_t;
+		};
+		// both counts start at zero; mSize and mChildSize are default-constructed
+		MemStatAccumulator()
+		:	mAllocatedCount(0),
+			mDeallocatedCount(0)
+		{}
+		// merges both size accumulators (forwarding append) and adds other's allocation/deallocation counts
+		void addSamples(const MemStatAccumulator& other, bool append)
+		{
+			mSize.addSamples(other.mSize, append);
+			mChildSize.addSamples(other.mChildSize, append);
+			mAllocatedCount += other.mAllocatedCount;
+			mDeallocatedCount += other.mDeallocatedCount;
+		}
+		// resets both size accumulators against other's (or NULL) and zeroes the counts
+		void reset(const MemStatAccumulator* other)
+		{
+			mSize.reset(other ? &other->mSize : NULL);
+			mChildSize.reset(other ? &other->mChildSize : NULL);
+			mAllocatedCount = 0;
+			mDeallocatedCount = 0;
+		}
+		// forwards the time stamp to both size accumulators
+		void sync(LLUnitImplicit<F64, LLUnits::Seconds> time_stamp) 
+		{
+			mSize.sync(time_stamp);
+			mChildSize.sync(time_stamp);
+		}
+
+		SampleAccumulator	mSize,
+							mChildSize;
+		int					mAllocatedCount,
+							mDeallocatedCount;
+	};
+
+	struct AccumulatorBufferGroup : public LLRefCount
+	{
+		AccumulatorBufferGroup();
+
+		void handOffTo(AccumulatorBufferGroup& other);
+		void makePrimary();
+		bool isPrimary() const;
+		static void clearPrimary();
+		// append() and merge() both add other's samples; merge() passes append == false and leaves mStackTimers out
+		void append(const AccumulatorBufferGroup& other);
+		void merge(const AccumulatorBufferGroup& other);
+		void reset(AccumulatorBufferGroup* other = NULL);
+		void sync();
+		// one buffer per accumulator type
+		AccumulatorBuffer<CountAccumulator>	 			mCounts;
+		AccumulatorBuffer<SampleAccumulator>			mSamples;
+		AccumulatorBuffer<EventAccumulator>				mEvents;
+		AccumulatorBuffer<TimeBlockAccumulator> 		mStackTimers;
+		AccumulatorBuffer<MemStatAccumulator> 			mMemStats;
+	};
+}
+
+#endif // LL_LLTRACEACCUMULATORS_H
+
diff --git a/indra/llcommon/lltracerecording.cpp b/indra/llcommon/lltracerecording.cpp
index d34434f16156c640103337b6f340c6e804aba68a..0938317eaaad9961f25602e80b7ec3339cd91f03 100644
--- a/indra/llcommon/lltracerecording.cpp
+++ b/indra/llcommon/lltracerecording.cpp
@@ -33,85 +33,7 @@
 
 namespace LLTrace
 {
-
-
-///////////////////////////////////////////////////////////////////////
-// RecordingBuffers
-///////////////////////////////////////////////////////////////////////
-
-RecordingBuffers::RecordingBuffers() 
-{}
-
-void RecordingBuffers::handOffTo(RecordingBuffers& other)
-{
-	other.mCounts.reset(&mCounts);
-	other.mSamples.reset(&mSamples);
-	other.mEvents.reset(&mEvents);
-	other.mStackTimers.reset(&mStackTimers);
-	other.mMemStats.reset(&mMemStats);
-}
-
-void RecordingBuffers::makePrimary()
-{
-	mCounts.makePrimary();
-	mSamples.makePrimary();
-	mEvents.makePrimary();
-	mStackTimers.makePrimary();
-	mMemStats.makePrimary();
-
-	ThreadRecorder* thread_recorder = get_thread_recorder().get();
-	AccumulatorBuffer<TimeBlockAccumulator>& timer_accumulator_buffer = mStackTimers;
-	// update stacktimer parent pointers
-	for (S32 i = 0, end_i = mStackTimers.size(); i < end_i; i++)
-	{
-		TimeBlockTreeNode* tree_node = thread_recorder->getTimeBlockTreeNode(i);
-		if (tree_node)
-		{
-			timer_accumulator_buffer[i].mParent = tree_node->mParent;
-		}
-	}
-}
-
-bool RecordingBuffers::isPrimary() const
-{
-	return mCounts.isPrimary();
-}
-
-void RecordingBuffers::append( const RecordingBuffers& other )
-{
-	mCounts.addSamples(other.mCounts);
-	mSamples.addSamples(other.mSamples);
-	mEvents.addSamples(other.mEvents);
-	mMemStats.addSamples(other.mMemStats);
-	mStackTimers.addSamples(other.mStackTimers);
-}
-
-void RecordingBuffers::merge( const RecordingBuffers& other)
-{
-	mCounts.addSamples(other.mCounts, false);
-	mSamples.addSamples(other.mSamples, false);
-	mEvents.addSamples(other.mEvents, false);
-	mMemStats.addSamples(other.mMemStats, false);
-	// for now, hold out timers from merge, need to be displayed per thread
-	//mStackTimers.addSamples(other.mStackTimers, false);
-}
-
-void RecordingBuffers::reset(RecordingBuffers* other)
-{
-	mCounts.reset(other ? &other->mCounts : NULL);
-	mSamples.reset(other ? &other->mSamples : NULL);
-	mEvents.reset(other ? &other->mEvents : NULL);
-	mStackTimers.reset(other ? &other->mStackTimers : NULL);
-	mMemStats.reset(other ? &other->mMemStats : NULL);
-}
-
-void RecordingBuffers::flush()
-{
-	LLUnitImplicit<F64, LLUnits::Seconds> time_stamp = LLTimer::getTotalSeconds();
-
-	mSamples.flush(time_stamp);
-}
-
+	
 ///////////////////////////////////////////////////////////////////////
 // Recording
 ///////////////////////////////////////////////////////////////////////
@@ -119,7 +41,7 @@ void RecordingBuffers::flush()
 Recording::Recording() 
 :	mElapsedSeconds(0)
 {
-	mBuffers = new RecordingBuffers();
+	mBuffers = new AccumulatorBufferGroup();
 }
 
 Recording::Recording( const Recording& other )
@@ -132,17 +54,17 @@ Recording& Recording::operator = (const Recording& other)
 	// this will allow us to seamlessly start without affecting any data we've acquired from other
 	setPlayState(PAUSED);
 
-	Recording& mutable_other = const_cast<Recording&>(other);
-	mutable_other.update();
+	const_cast<Recording&>(other).update();
 	EPlayState other_play_state = other.getPlayState();
 
-	mBuffers = mutable_other.mBuffers;
-
-	LLStopWatchControlsMixin<Recording>::setPlayState(other_play_state);
+	mBuffers = other.mBuffers;
 
 	// above call will clear mElapsedSeconds as a side effect, so copy it here
 	mElapsedSeconds = other.mElapsedSeconds;
 	mSamplingTimer = other.mSamplingTimer;
+
+	setPlayState(other_play_state);
+
 	return *this;
 }
 
@@ -151,7 +73,7 @@ Recording::~Recording()
 {
 	if (isStarted() && LLTrace::get_thread_recorder().notNull())
 	{
-		LLTrace::get_thread_recorder()->deactivate(this);
+		LLTrace::get_thread_recorder()->deactivate(mBuffers.write());
 	}
 }
 
@@ -159,8 +81,10 @@ void Recording::update()
 {
 	if (isStarted())
 	{
-		mBuffers.write()->flush();
-		LLTrace::get_thread_recorder()->bringUpToDate(this);
+		mElapsedSeconds += mSamplingTimer.getElapsedTimeF64();
+		AccumulatorBufferGroup* buffers = mBuffers.write();
+		LLTrace::get_thread_recorder()->bringUpToDate(buffers);
+
 		mSamplingTimer.reset();
 	}
 }
@@ -176,14 +100,14 @@ void Recording::handleReset()
 void Recording::handleStart()
 {
 	mSamplingTimer.reset();
-	LLTrace::get_thread_recorder()->activate(this);
+	LLTrace::get_thread_recorder()->activate(mBuffers.write());
 }
 
 void Recording::handleStop()
 {
 	mElapsedSeconds += mSamplingTimer.getElapsedTimeF64();
-	mBuffers.write()->flush();
-	LLTrace::get_thread_recorder()->deactivate(this);
+	AccumulatorBufferGroup* buffers = mBuffers.write();
+	LLTrace::get_thread_recorder()->deactivate(buffers);
 }
 
 void Recording::handleSplitTo(Recording& other)
@@ -191,19 +115,14 @@ void Recording::handleSplitTo(Recording& other)
 	mBuffers.write()->handOffTo(*other.mBuffers.write());
 }
 
-void Recording::appendRecording( const Recording& other )
+void Recording::appendRecording( Recording& other )
 {
 	update();
+	other.update();
 	mBuffers.write()->append(*other.mBuffers);
 	mElapsedSeconds += other.mElapsedSeconds;
 }
 
-void Recording::mergeRecording( const Recording& other)
-{
-	update();
-	mBuffers.write()->merge(*other.mBuffers);
-}
-
 LLUnit<F64, LLUnits::Seconds> Recording::getSum(const TraceType<TimeBlockAccumulator>& stat)
 {
 	const TimeBlockAccumulator& accumulator = mBuffers->mStackTimers[stat.getIndex()];
@@ -710,8 +629,6 @@ F64 PeriodicRecording::getPeriodMean( const TraceType<SampleAccumulator>& stat,
 
 void ExtendableRecording::extend()
 {
-	// stop recording to get latest data
-	mPotentialRecording.update();
 	// push the data back to accepted recording
 	mAcceptedRecording.appendRecording(mPotentialRecording);
 	// flush data, so we can start from scratch
diff --git a/indra/llcommon/lltracerecording.h b/indra/llcommon/lltracerecording.h
index b839e85de09bd8cc6cec49b4261dde43792ea520..355dbabb1c11705c9248aa07e89fad2e2b48e75b 100644
--- a/indra/llcommon/lltracerecording.h
+++ b/indra/llcommon/lltracerecording.h
@@ -32,7 +32,7 @@
 
 #include "llpointer.h"
 #include "lltimer.h"
-#include "lltrace.h"
+#include "lltraceaccumulators.h"
 
 class LLStopWatchControlsMixinCommon
 {
@@ -81,6 +81,7 @@ class LLStopWatchControlsMixin
 :	public LLStopWatchControlsMixinCommon
 {
 public:
+
 	typedef LLStopWatchControlsMixin<DERIVED> self_t;
 	virtual void splitTo(DERIVED& other)
 	{
@@ -98,6 +99,11 @@ class LLStopWatchControlsMixin
 		static_cast<self_t&>(other).handleSplitTo(*static_cast<DERIVED*>(this));
 	}
 private:
+	self_t& operator = (const self_t& other)
+	{
+		return *this;	// nothing is copied here; the derived class must implement the assignment logic
+	}
+
 	// atomically stop this object while starting the other
 	// no data can be missed in between stop and start
 	virtual void handleSplitTo(DERIVED& other) {};
@@ -106,26 +112,6 @@ class LLStopWatchControlsMixin
 
 namespace LLTrace
 {
-	struct RecordingBuffers : public LLRefCount
-	{
-		RecordingBuffers();
-
-		void handOffTo(RecordingBuffers& other);
-		void makePrimary();
-		bool isPrimary() const;
-
-		void append(const RecordingBuffers& other);
-		void merge(const RecordingBuffers& other);
-		void reset(RecordingBuffers* other = NULL);
-		void flush();
-
-		AccumulatorBuffer<CountAccumulator>	 			mCounts;
-		AccumulatorBuffer<SampleAccumulator>			mSamples;
-		AccumulatorBuffer<EventAccumulator>				mEvents;
-		AccumulatorBuffer<TimeBlockAccumulator> 		mStackTimers;
-		AccumulatorBuffer<MemStatAccumulator> 			mMemStats;
-	};
-
 	class Recording 
 	:	public LLStopWatchControlsMixin<Recording>
 	{
@@ -138,10 +124,7 @@ namespace LLTrace
 		Recording& operator = (const Recording& other);
 
 		// accumulate data from subsequent, non-overlapping recording
-		void appendRecording(const Recording& other);
-
-		// gather data from recording, ignoring time relationship (for example, pulling data from slave threads)
-		void mergeRecording(const Recording& other);
+		void appendRecording(Recording& other);
 
 		// grab latest recorded data
 		void update();
@@ -291,7 +274,7 @@ namespace LLTrace
 
 		LLTimer				mSamplingTimer;
 		LLUnit<F64, LLUnits::Seconds>			mElapsedSeconds;
-		LLCopyOnWritePointer<RecordingBuffers>	mBuffers;
+		LLCopyOnWritePointer<AccumulatorBufferGroup>	mBuffers;
 	};
 
 	class LL_COMMON_API PeriodicRecording
diff --git a/indra/llcommon/lltracethreadrecorder.cpp b/indra/llcommon/lltracethreadrecorder.cpp
index 54006f4e5b828d2af27e534bccef60ae8ea10f02..7192564c94898bf856ad319517767e2f932c5d04 100644
--- a/indra/llcommon/lltracethreadrecorder.cpp
+++ b/indra/llcommon/lltracethreadrecorder.cpp
@@ -31,6 +31,7 @@
 namespace LLTrace
 {
 
+MasterThreadRecorder* gUIThreadRecorder = NULL;
 
 ///////////////////////////////////////////////////////////////////////
 // ThreadRecorder
@@ -49,7 +50,7 @@ ThreadRecorder::ThreadRecorder()
 	mNumTimeBlockTreeNodes = AccumulatorBuffer<TimeBlockAccumulator>::getDefaultBuffer()->size();
 	mTimeBlockTreeNodes = new TimeBlockTreeNode[mNumTimeBlockTreeNodes];
 
-	mThreadRecording.start();
+	activate(&mThreadRecordingBuffers);
 
 	// initialize time block parent pointers
 	for (LLInstanceTracker<TimeBlock>::instance_iter it = LLInstanceTracker<TimeBlock>::beginInstances(), end_it = LLInstanceTracker<TimeBlock>::endInstances(); 
@@ -72,6 +73,8 @@ ThreadRecorder::ThreadRecorder()
 
 ThreadRecorder::~ThreadRecorder()
 {
+	deactivate(&mThreadRecordingBuffers);
+
 	delete mRootTimer;
 
 	if (!mActiveRecordings.empty())
@@ -84,7 +87,7 @@ ThreadRecorder::~ThreadRecorder()
 	delete[] mTimeBlockTreeNodes;
 }
 
-TimeBlockTreeNode* ThreadRecorder::getTimeBlockTreeNode(S32 index)
+TimeBlockTreeNode* ThreadRecorder::getTimeBlockTreeNode( S32 index )
 {
 	if (0 <= index && index < mNumTimeBlockTreeNodes)
 	{
@@ -94,23 +97,33 @@ TimeBlockTreeNode* ThreadRecorder::getTimeBlockTreeNode(S32 index)
 }
 
 
-void ThreadRecorder::activate( Recording* recording )
+void ThreadRecorder::activate( AccumulatorBufferGroup* recording )
 {
+	active_recording_list_t::reverse_iterator it, end_it;
+	for (it = mActiveRecordings.rbegin(), end_it = mActiveRecordings.rend();
+		it != end_it;
+		++it)
+	{
+		llassert((*it)->mTargetRecording != recording);
+	}
+
 	ActiveRecording* active_recording = new ActiveRecording(recording);
 	if (!mActiveRecordings.empty())
 	{
-		mActiveRecordings.back()->mPartialRecording.handOffTo(active_recording->mPartialRecording);
+		AccumulatorBufferGroup& prev_active_recording = mActiveRecordings.back()->mPartialRecording;
+		prev_active_recording.sync();
+		prev_active_recording.handOffTo(active_recording->mPartialRecording);
 	}
 	mActiveRecordings.push_back(active_recording);
 
 	mActiveRecordings.back()->mPartialRecording.makePrimary();
 }
 
-ThreadRecorder::active_recording_list_t::reverse_iterator ThreadRecorder::bringUpToDate( Recording* recording )
+ThreadRecorder::active_recording_list_t::reverse_iterator ThreadRecorder::bringUpToDate( AccumulatorBufferGroup* recording )
 {
 	if (mActiveRecordings.empty()) return mActiveRecordings.rend();
 
-	mActiveRecordings.back()->mPartialRecording.flush();
+	mActiveRecordings.back()->mPartialRecording.sync();
 	TimeBlock::updateTimes();
 
 	active_recording_list_t::reverse_iterator it, end_it;
@@ -148,34 +161,38 @@ ThreadRecorder::active_recording_list_t::reverse_iterator ThreadRecorder::bringU
 	return it;
 }
 
-void ThreadRecorder::deactivate( Recording* recording )
+void ThreadRecorder::deactivate( AccumulatorBufferGroup* recording )
 {
 	active_recording_list_t::reverse_iterator it = bringUpToDate(recording);
 	if (it != mActiveRecordings.rend())
 	{
-		// and if we've found the recording we wanted to update
-		active_recording_list_t::reverse_iterator next_it = it;
-		++next_it;
-		if (next_it != mActiveRecordings.rend())
-		{
-			(*next_it)->mPartialRecording.makePrimary();
-		}
-
 		active_recording_list_t::iterator recording_to_remove = (++it).base();
+		bool was_primary = (*recording_to_remove)->mPartialRecording.isPrimary();
 		llassert((*recording_to_remove)->mTargetRecording == recording);
 		delete *recording_to_remove;
 		mActiveRecordings.erase(recording_to_remove);
+		if (was_primary)
+		{
+			if (mActiveRecordings.empty())
+			{
+				AccumulatorBufferGroup::clearPrimary();
+			}
+			else
+			{
+				mActiveRecordings.back()->mPartialRecording.makePrimary();
+			}
+		}
 	}
 }
 
-ThreadRecorder::ActiveRecording::ActiveRecording( Recording* target ) 
+ThreadRecorder::ActiveRecording::ActiveRecording( AccumulatorBufferGroup* target ) 
 :	mTargetRecording(target)
 {
 }
 
 void ThreadRecorder::ActiveRecording::movePartialToTarget()
 {
-	mTargetRecording->mBuffers.write()->append(mPartialRecording);
+	mTargetRecording->append(mPartialRecording);
 	// reset based on self to keep history
 	mPartialRecording.reset(&mPartialRecording);
 }
@@ -197,79 +214,49 @@ SlaveThreadRecorder::~SlaveThreadRecorder()
 }
 
 void SlaveThreadRecorder::pushToMaster()
-{
-	mThreadRecording.stop();
-	{
-		LLMutexLock(mMasterRecorder.getSlaveListMutex());
-		mSharedData.appendFrom(mThreadRecording);
+{ 
+	{ LLMutexLock lock(&mSharedRecordingMutex);	
+		LLTrace::get_thread_recorder()->bringUpToDate(&mThreadRecordingBuffers);
+		mSharedRecordingBuffers.append(mThreadRecordingBuffers);
 	}
-	mThreadRecording.start();
-}
-
-void SlaveThreadRecorder::SharedData::appendFrom( const Recording& source )
-{
-	LLMutexLock lock(&mRecordingMutex);
-	appendRecording(source);
 }
 
-void SlaveThreadRecorder::SharedData::appendTo( Recording& sink )
-{
-	LLMutexLock lock(&mRecordingMutex);
-	sink.appendRecording(*this);
-}
-
-void SlaveThreadRecorder::SharedData::mergeFrom( const RecordingBuffers& source )
-{
-	LLMutexLock lock(&mRecordingMutex);
-	mBuffers.write()->merge(source);
-}
-
-void SlaveThreadRecorder::SharedData::mergeTo( RecordingBuffers& sink )
-{
-	LLMutexLock lock(&mRecordingMutex);
-	sink.merge(*mBuffers);
-}
-
-void SlaveThreadRecorder::SharedData::reset()
-{
-	LLMutexLock lock(&mRecordingMutex);
-	Recording::reset();
-}
-
-
 ///////////////////////////////////////////////////////////////////////
 // MasterThreadRecorder
 ///////////////////////////////////////////////////////////////////////
 
 static LLFastTimer::DeclareTimer FTM_PULL_TRACE_DATA_FROM_SLAVES("Pull slave trace data");
+
 void MasterThreadRecorder::pullFromSlaveThreads()
 {
-	LLFastTimer _(FTM_PULL_TRACE_DATA_FROM_SLAVES);
+	/*LLFastTimer _(FTM_PULL_TRACE_DATA_FROM_SLAVES);
 	if (mActiveRecordings.empty()) return;
 
-	LLMutexLock lock(&mSlaveListMutex);
+	{ LLMutexLock lock(&mSlaveListMutex);
 
-	RecordingBuffers& target_recording_buffers = mActiveRecordings.back()->mPartialRecording;
+	AccumulatorBufferGroup& target_recording_buffers = mActiveRecordings.back()->mPartialRecording;
+	target_recording_buffers.sync();
 	for (slave_thread_recorder_list_t::iterator it = mSlaveThreadRecorders.begin(), end_it = mSlaveThreadRecorders.end();
-		it != end_it;
-		++it)
-	{
-		// ignore block timing info for now
-		(*it)->mSharedData.mergeTo(target_recording_buffers);
-		(*it)->mSharedData.reset();
+	it != end_it;
+	++it)
+	{ LLMutexLock lock(&(*it)->mSharedRecordingMutex);
+
+	target_recording_buffers.merge((*it)->mSharedRecordingBuffers);
+	(*it)->mSharedRecordingBuffers.reset();
 	}
+	}*/
 }
 
+// called by slave thread
 void MasterThreadRecorder::addSlaveThread( class SlaveThreadRecorder* child )
-{
-	LLMutexLock lock(&mSlaveListMutex);
+{ LLMutexLock lock(&mSlaveListMutex);
 
 	mSlaveThreadRecorders.push_back(child);
 }
 
+// called by slave thread
 void MasterThreadRecorder::removeSlaveThread( class SlaveThreadRecorder* child )
-{
-	LLMutexLock lock(&mSlaveListMutex);
+{ LLMutexLock lock(&mSlaveListMutex);
 
 	for (slave_thread_recorder_list_t::iterator it = mSlaveThreadRecorders.begin(), end_it = mSlaveThreadRecorders.end();
 		it != end_it;
@@ -289,4 +276,28 @@ void MasterThreadRecorder::pushToMaster()
 MasterThreadRecorder::MasterThreadRecorder()
 {}
 
+// Returns a reference to the recorder that gUIThreadRecorder points at; asserts the pointer is non-NULL first.
+MasterThreadRecorder& getUIThreadRecorder()
+{
+	llassert(gUIThreadRecorder != NULL);
+	return *gUIThreadRecorder;
+}
+
+LLThreadLocalPointer<ThreadRecorder>& get_thread_recorder_ptr()
+{	// function-local static: constructed on first call, same object returned on every call
+	static LLThreadLocalPointer<ThreadRecorder> s_recorder_slot;
+	return s_recorder_slot;
+}
+
+const LLThreadLocalPointer<ThreadRecorder>& get_thread_recorder()
+{
+	return get_thread_recorder_ptr();	// same object as get_thread_recorder_ptr(), exposed through a const reference
+}
+
+void set_thread_recorder(ThreadRecorder* recorder)
+{
+	get_thread_recorder_ptr() = recorder;	// assigns recorder into the pointer returned by get_thread_recorder_ptr()
+}
+
+
 }
diff --git a/indra/llcommon/lltracethreadrecorder.h b/indra/llcommon/lltracethreadrecorder.h
index bf3701304fd63af7a668f1c3d226241ed73f4a8b..6b7a8e5865e5fb689bbaed262b28639444bf201c 100644
--- a/indra/llcommon/lltracethreadrecorder.h
+++ b/indra/llcommon/lltracethreadrecorder.h
@@ -31,7 +31,8 @@
 #include "llpreprocessor.h"
 
 #include "llmutex.h"
-#include "lltracerecording.h"
+#include "lltraceaccumulators.h"
+#include "llthreadlocalstorage.h"
 
 namespace LLTrace
 {
@@ -45,9 +46,9 @@ namespace LLTrace
 
 		virtual ~ThreadRecorder();
 
-		void activate(Recording* recording);
-		void deactivate(Recording* recording);
-		active_recording_list_t::reverse_iterator bringUpToDate(Recording* recording);
+		void activate(AccumulatorBufferGroup* recording);
+		void deactivate(AccumulatorBufferGroup* recording);
+		active_recording_list_t::reverse_iterator bringUpToDate(AccumulatorBufferGroup* recording);
 
 		virtual void pushToMaster() = 0;
 
@@ -56,20 +57,21 @@ namespace LLTrace
 	protected:
 		struct ActiveRecording
 		{
-			ActiveRecording(Recording* target);
+			ActiveRecording(AccumulatorBufferGroup* target);
 
-			Recording*			mTargetRecording;
-			RecordingBuffers	mPartialRecording;
+			AccumulatorBufferGroup*	mTargetRecording;
+			AccumulatorBufferGroup	mPartialRecording;
 
 			void movePartialToTarget();
 		};
-		Recording					mThreadRecording;
+		AccumulatorBufferGroup			mThreadRecordingBuffers;
 
 		active_recording_list_t		mActiveRecordings;
 
 		class BlockTimer*			mRootTimer;
 		TimeBlockTreeNode*			mTimeBlockTreeNodes;
 		size_t						mNumTimeBlockTreeNodes;
+		BlockTimerStackRecord		mBlockTimerStackRecord;
 	};
 
 	class LL_COMMON_API MasterThreadRecorder : public ThreadRecorder
@@ -85,9 +87,6 @@ namespace LLTrace
 		// call this periodically to gather stats data from slave threads
 		void pullFromSlaveThreads();
 
-		LLMutex* getSlaveListMutex() { return &mSlaveListMutex; }
-
-
 	private:
 
 		typedef std::list<class SlaveThreadRecorder*> slave_thread_recorder_list_t;
@@ -105,22 +104,20 @@ namespace LLTrace
 		// call this periodically to gather stats data for master thread to consume
 		/*virtual*/ void pushToMaster();
 
-		MasterThreadRecorder* 	mMaster;
-
-		class SharedData : public Recording
-		{
-		public:
-			void appendFrom(const Recording& source);
-			void appendTo(Recording& sink);
-			void mergeFrom(const RecordingBuffers& source);
-			void mergeTo(RecordingBuffers& sink);
-			void reset();
-		private:
-			LLMutex		mRecordingMutex;
-		};
-		SharedData				mSharedData;
+	private:
+		friend class MasterThreadRecorder;
+		LLMutex					mSharedRecordingMutex;
+		AccumulatorBufferGroup	mSharedRecordingBuffers;
 		MasterThreadRecorder&	mMasterRecorder;
 	};
+
+	//FIXME: let user code set up thread recorder topology
+	extern MasterThreadRecorder* gUIThreadRecorder ;	// dereferenced by getUIThreadRecorder(), which asserts it is non-NULL
+
+	const LLThreadLocalPointer<class ThreadRecorder>& get_thread_recorder();	// const access to the recorder pointer
+	void set_thread_recorder(class ThreadRecorder*);	// replaces the recorder pointer read by get_thread_recorder()
+	class MasterThreadRecorder& getUIThreadRecorder();	// returns *gUIThreadRecorder
+
 }
 
 #endif // LL_LLTRACETHREADRECORDER_H
diff --git a/indra/newview/llappviewer.cpp b/indra/newview/llappviewer.cpp
index 733c9cc9df83b6ba770c549e0cfe1a327f348fff..7c5cd520dad187b9dfd41bcb6d4878eeadb722af 100755
--- a/indra/newview/llappviewer.cpp
+++ b/indra/newview/llappviewer.cpp
@@ -1294,6 +1294,8 @@ bool LLAppViewer::mainLoop()
 	{
 		LLFastTimer _(FTM_FRAME);
 		LLTrace::TimeBlock::processTimes();
+		llassert((LLTrace::get_frame_recording().getCurRecording().update(), 
+				LLTrace::get_frame_recording().getCurRecording().getSampleCount(LLStatViewer::FPS) <= 1));
 		LLTrace::get_frame_recording().nextPeriod();
 		LLTrace::TimeBlock::logStats();
 
@@ -5617,6 +5619,6 @@ void LLAppViewer::metricsSend(bool enable_reporting)
 	// Reset even if we can't report.  Rather than gather up a huge chunk of
 	// data, we'll keep to our sampling interval and retain the data
 	// resolution in time.
-	gViewerAssetStats->reset();
+	gViewerAssetStats->restart();
 }
 
diff --git a/indra/newview/llscenemonitor.cpp b/indra/newview/llscenemonitor.cpp
index ed9eeb93305230fc7a84b630004b05880290872c..a4d693ec0b6907db25f2263021e63c40237f6233 100644
--- a/indra/newview/llscenemonitor.cpp
+++ b/indra/newview/llscenemonitor.cpp
@@ -260,14 +260,7 @@ void LLSceneMonitor::capture()
 	static LLCachedControl<bool> monitor_enabled(gSavedSettings, "SceneLoadingMonitorEnabled");
 	static LLCachedControl<F32>  scene_load_sample_time(gSavedSettings, "SceneLoadingMonitorSampleTime");
 	static LLFrameTimer timer;	
-
-	if (mEnabled 
-		&&	(mMonitorRecording.getSum(*LLViewerCamera::getVelocityStat()) > 0.1f
-			|| mMonitorRecording.getSum(*LLViewerCamera::getAngularVelocityStat()) > 0.05f))
-	{
-		reset();
-		freezeScene();
-	}
+	static bool force_capture = true;
 
 	bool enabled = monitor_enabled || mDebugViewerVisible;
 	if(mEnabled != enabled)
@@ -275,6 +268,7 @@ void LLSceneMonitor::capture()
 		if(mEnabled)
 		{
 			unfreezeScene();
+			force_capture = true;
 		}
 		else
 		{
@@ -285,11 +279,23 @@ void LLSceneMonitor::capture()
 		mEnabled = enabled;
 	}
 
-	if(timer.getElapsedTimeF32() > scene_load_sample_time()
+	if (mEnabled 
+		&&	(mMonitorRecording.getSum(*LLViewerCamera::getVelocityStat()) > 0.1f
+		|| mMonitorRecording.getSum(*LLViewerCamera::getAngularVelocityStat()) > 0.05f))
+	{
+		reset();
+		freezeScene();
+		force_capture = true;
+	}
+
+	if((timer.getElapsedTimeF32() > scene_load_sample_time() 
+			|| force_capture)
 		&& mEnabled
 		&& LLGLSLShader::sNoFixedFunction
 		&& last_capture_time != gFrameCount)
 	{
+		force_capture = false;
+
 		mSceneLoadRecording.resume();
 		mMonitorRecording.resume();
 
@@ -479,12 +485,10 @@ void LLSceneMonitor::fetchQueryResult()
 			if(mDiffResult > diff_threshold())
 			{
 				mSceneLoadRecording.extend();
-				llassert(mSceneLoadRecording.getAcceptedRecording().getLastRecording().getSum(LLStatViewer::FPS));
 			}
 			else
 			{
 				mSceneLoadRecording.getPotentialRecording().nextPeriod();
-				llassert(mSceneLoadRecording.getPotentialRecording().getLastRecording().getSum(LLStatViewer::FPS));
 			}
 		}
 	}
diff --git a/indra/newview/llstartup.cpp b/indra/newview/llstartup.cpp
index de8d549055214039a63aac942f307a612e4740b9..097ea7cc8dc5f9f67f713eb5d9d710f984c8656e 100755
--- a/indra/newview/llstartup.cpp
+++ b/indra/newview/llstartup.cpp
@@ -2054,6 +2054,7 @@ bool idle_startup()
 		const F32 wearables_time = wearables_timer.getElapsedTimeF32();
 		static LLCachedControl<F32> max_wearables_time(gSavedSettings, "ClothingLoadingDelay");
 
+		display_startup();
 		if (!gAgent.isGenderChosen() && isAgentAvatarValid())
 		{
 			// No point in waiting for clothing, we don't even
@@ -2067,50 +2068,39 @@ bool idle_startup()
 			LLNotificationsUtil::add("WelcomeChooseSex", LLSD(), LLSD(),
 				callback_choose_gender);
 			LLStartUp::setStartupState( STATE_CLEANUP );
-			return TRUE;
 		}
-		
-		display_startup();
-
-		if (wearables_time > max_wearables_time())
+		else if (wearables_time >= max_wearables_time())
 		{
 			LLNotificationsUtil::add("ClothingLoading");
 			record(LLStatViewer::LOADING_WEARABLES_LONG_DELAY, wearables_time);
 			LLStartUp::setStartupState( STATE_CLEANUP );
-			return TRUE;
 		}
-
-		if (gAgent.isFirstLogin())
+		else if (gAgent.isFirstLogin()
+				&& isAgentAvatarValid()
+				&& gAgentAvatarp->isFullyLoaded())
 		{
 			// wait for avatar to be completely loaded
-			if (isAgentAvatarValid()
-				&& gAgentAvatarp->isFullyLoaded())
-			{
-				//llinfos << "avatar fully loaded" << llendl;
-				LLStartUp::setStartupState( STATE_CLEANUP );
-				return TRUE;
-			}
+			//llinfos << "avatar fully loaded" << llendl;
+			LLStartUp::setStartupState( STATE_CLEANUP );
+		}
+		// OK to just get the wearables
+		else if (!gAgent.isFirstLogin() && gAgentWearables.areWearablesLoaded() )
+		{
+			// We have our clothing, proceed.
+			//llinfos << "wearables loaded" << llendl;
+			LLStartUp::setStartupState( STATE_CLEANUP );
 		}
 		else
 		{
-			// OK to just get the wearables
-			if ( gAgentWearables.areWearablesLoaded() )
-			{
-				// We have our clothing, proceed.
-				//llinfos << "wearables loaded" << llendl;
-				LLStartUp::setStartupState( STATE_CLEANUP );
-				return TRUE;
-			}
+			display_startup();
+			update_texture_fetch();
+			display_startup();
+			set_startup_status(0.9f + 0.1f * wearables_time / max_wearables_time(),
+				LLTrans::getString("LoginDownloadingClothing").c_str(),
+				gAgent.mMOTD.c_str());
+			display_startup();
 		}
-
-		display_startup();
-		update_texture_fetch();
-		display_startup();
-		set_startup_status(0.9f + 0.1f * wearables_time / max_wearables_time(),
-						 LLTrans::getString("LoginDownloadingClothing").c_str(),
-						 gAgent.mMOTD.c_str());
-		display_startup();
-		return TRUE;
+		//fall through this frame to STATE_CLEANUP
 	}
 
 	if (STATE_CLEANUP == LLStartUp::getStartupState())
diff --git a/indra/newview/llviewerassetstats.cpp b/indra/newview/llviewerassetstats.cpp
index 6ab2aefc34efbf22f7d79ef0ed4ef1d67dc9b177..80412c215f785716e6cdf21bfcfe5e77742ac152 100755
--- a/indra/newview/llviewerassetstats.cpp
+++ b/indra/newview/llviewerassetstats.cpp
@@ -314,9 +314,9 @@ void LLViewerAssetStats::handleStop()
 }
 
 void LLViewerAssetStats::handleReset()
-	{
+{	// forwards to reset()
 	reset();
-	}
+}
 
 
 void LLViewerAssetStats::reset()
@@ -328,6 +328,7 @@ void LLViewerAssetStats::reset()
 	if (mRegionHandle)
 	{
 		mCurRecording = &mRegionRecordings[mRegionHandle];
+		mCurRecording->setPlayState(getPlayState());
 	}
 }
 
@@ -346,7 +347,7 @@ void LLViewerAssetStats::setRegion(region_handle_t region_handle)
 	if (region_handle)
 	{
 		mCurRecording = &mRegionRecordings[region_handle];
-		mCurRecording->start();
+		mCurRecording->setPlayState(getPlayState());
 	}
 
 	mRegionHandle = region_handle;
@@ -493,19 +494,19 @@ void LLViewerAssetStats::getStats(AssetStats& stats, bool compact_output)
 }
 
 LLSD LLViewerAssetStats::asLLSD(bool compact_output)
-		{
+{	// fills an AssetStats via getStats() and writes it into an LLSD with LLParamSDParser
 	LLParamSDParser parser;
 	LLSD sd;
 	AssetStats stats;
 	getStats(stats, compact_output);
 	LLInitParam::predicate_rule_t rule = LLInitParam::default_parse_rules();
 	if (!compact_output)
-		{
+	{	// non-compact output additionally allows LLInitParam::EMPTY in the rule
 		rule.allow(LLInitParam::EMPTY);
-		}
+	}
 	parser.writeSD(sd, stats, rule);
 	return sd;
-	}
+}
 
 // ------------------------------------------------------
 // Global free-function definitions (LLViewerAssetStatsFF namespace)
diff --git a/indra/newview/llviewercamera.cpp b/indra/newview/llviewercamera.cpp
index ebc4f09edbbcfbcf1634d81c621e43445752984f..57a0195d23ea41ed3074e0bbb4595422460b5146 100755
--- a/indra/newview/llviewercamera.cpp
+++ b/indra/newview/llviewercamera.cpp
@@ -155,7 +155,7 @@ void LLViewerCamera::updateCameraLocation(const LLVector3 &center,
 
 	setOriginAndLookAt(origin, up_direction, point_of_interest);
 
-	mVelocityDir = center - last_position ; 
+	mVelocityDir = origin - last_position ; 
 	F32 dpos = mVelocityDir.normVec() ;
 	LLQuaternion rotation;
 	rotation.shortestArc(last_axis, getAtAxis());
diff --git a/indra/newview/llviewerdisplay.cpp b/indra/newview/llviewerdisplay.cpp
index 1de849374967128859211764124f8982e52b4bbc..ee5793fe6a9a218a53943e382c73325e8d5009b2 100755
--- a/indra/newview/llviewerdisplay.cpp
+++ b/indra/newview/llviewerdisplay.cpp
@@ -106,6 +106,7 @@ LLFrameTimer gRecentMemoryTime;
 void pre_show_depth_buffer();
 void post_show_depth_buffer();
 void render_ui(F32 zoom_factor = 1.f, int subfield = 0);
+void swap();
 void render_hud_attachments();
 void render_ui_3d();
 void render_ui_2d();
@@ -344,7 +345,7 @@ void display(BOOL rebuild, F32 zoom_factor, int subfield, BOOL for_snapshot)
 	// Bail out if we're in the startup state and don't want to try to
 	// render the world.
 	//
-	if (LLStartUp::getStartupState() < STATE_STARTED)
+	if (LLStartUp::getStartupState() < STATE_WEARABLES_WAIT)
 	{
 		LLAppViewer::instance()->pingMainloopTimeout("Display:Startup");
 		display_startup();
@@ -553,6 +554,7 @@ void display(BOOL rebuild, F32 zoom_factor, int subfield, BOOL for_snapshot)
 	{
 		LLAppViewer::instance()->pingMainloopTimeout("Display:Disconnected");
 		render_ui();
+		swap();
 	}
 	
 	//////////////////////////
@@ -1021,6 +1023,7 @@ void display(BOOL rebuild, F32 zoom_factor, int subfield, BOOL for_snapshot)
 		{
 			LLFastTimer t(FTM_RENDER_UI);
 			render_ui();
+			swap();
 		}
 
 		
@@ -1244,8 +1247,6 @@ BOOL setup_hud_matrices(const LLRect& screen_region)
 	return TRUE;
 }
 
-static LLFastTimer::DeclareTimer FTM_SWAP("Swap");
-
 void render_ui(F32 zoom_factor, int subfield)
 {
 	LLGLState::checkStates();
@@ -1322,10 +1323,16 @@ void render_ui(F32 zoom_factor, int subfield)
 		glh_set_current_modelview(saved_view);
 		gGL.popMatrix();
 	}
+}
+
+static LLFastTimer::DeclareTimer FTM_SWAP("Swap");
+
+void swap()
+{
+	LLFastTimer t(FTM_SWAP);
 
 	if (gDisplaySwapBuffers)
 	{
-		LLFastTimer t(FTM_SWAP);
 		gViewerWindow->getWindow()->swapBuffers();
 	}
 	gDisplaySwapBuffers = TRUE;