diff --git a/indra/llcommon/llfasttimer.cpp b/indra/llcommon/llfasttimer.cpp
index 5dc5fdd5be2aee215fae2d1ae7cdb18411995fcd..3fdd33959d62e20ec4c8d8dfb0672f2c9985729a 100644
--- a/indra/llcommon/llfasttimer.cpp
+++ b/indra/llcommon/llfasttimer.cpp
@@ -180,6 +180,7 @@ TimeBlockTreeNode& TimeBlock::getTreeNode() const
 
 static LLFastTimer::DeclareTimer FTM_PROCESS_TIMES("Process FastTimer Times");
 
+// Not thread safe: must only be called from the main thread.
 //static
 void TimeBlock::processTimes()
 {
@@ -195,8 +196,8 @@ void TimeBlock::processTimes()
 		TimeBlock& timer = *it;
 		if (&timer == &TimeBlock::getRootTimeBlock()) continue;
 			
-			// bootstrap tree construction by attaching to last timer to be on stack
-			// when this timer was called
+		// bootstrap tree construction by attaching to last timer to be on stack
+		// when this timer was called
 		if (timer.getParent() == &TimeBlock::getRootTimeBlock())
 		{
 			TimeBlockAccumulator* accumulator = timer.getPrimaryAccumulator();
@@ -233,30 +234,30 @@ void TimeBlock::processTimes()
 			TimeBlockAccumulator* accumulator = timerp->getPrimaryAccumulator();
 
 			if (accumulator->mMoveUpTree)
-		{
+			{
 				// since ancestors have already been visited, re-parenting won't affect tree traversal
-			//step up tree, bringing our descendants with us
-			LL_DEBUGS("FastTimers") << "Moving " << timerp->getName() << " from child of " << timerp->getParent()->getName() <<
-				" to child of " << timerp->getParent()->getParent()->getName() << LL_ENDL;
-			timerp->setParent(timerp->getParent()->getParent());
-				accumulator->mParent = timerp->getParent();
-				accumulator->mMoveUpTree = false;
-
-			// don't bubble up any ancestors until descendants are done bubbling up
-				// as ancestors may call this timer only on certain paths, so we want to resolve
-				// child-most block locations before their parents
-			it.skipAncestors();
+				//step up tree, bringing our descendants with us
+				LL_DEBUGS("FastTimers") << "Moving " << timerp->getName() << " from child of " << timerp->getParent()->getName() <<
+					" to child of " << timerp->getParent()->getParent()->getName() << LL_ENDL;
+				timerp->setParent(timerp->getParent()->getParent());
+					accumulator->mParent = timerp->getParent();
+					accumulator->mMoveUpTree = false;
+
+				// don't bubble up any ancestors until descendants are done bubbling up
+					// as ancestors may call this timer only on certain paths, so we want to resolve
+					// child-most block locations before their parents
+				it.skipAncestors();
+			}
 		}
 	}
-}
 
 	// walk up stack of active timers and accumulate current time while leaving timing structures active
 	BlockTimerStackRecord* stack_record			= ThreadTimerStack::getInstance();
 	BlockTimer* cur_timer						= stack_record->mActiveTimer;
 	TimeBlockAccumulator* accumulator = stack_record->mTimeBlock->getPrimaryAccumulator();
 
-	// root defined by parent pointing to self
-	while(cur_timer && cur_timer->mParentTimerData.mActiveTimer != cur_timer)
+	while(cur_timer 
+		&& cur_timer->mParentTimerData.mActiveTimer != cur_timer) // root defined by parent pointing to self
 	{
 		U64 cumulative_time_delta = cur_time - cur_timer->mStartTime;
 		accumulator->mTotalTimeCounter += cumulative_time_delta - (accumulator->mTotalTimeCounter - cur_timer->mBlockStartTotalTimeCounter);
@@ -413,8 +414,11 @@ TimeBlockAccumulator::TimeBlockAccumulator()
 	mParent(NULL)
 {}
 
-void TimeBlockAccumulator::addSamples( const TimeBlockAccumulator& other )
+void TimeBlockAccumulator::addSamples( const TimeBlockAccumulator& other, bool append )
 {
+	// Time block samples may only be appended, never merged: each thread has its own call
+	// hierarchy, so combining unrelated samples would corrupt the nested timings.
+	llassert(append);
 	mTotalTimeCounter += other.mTotalTimeCounter - other.mStartTotalTimeCounter;
 	mSelfTimeCounter += other.mSelfTimeCounter;
 	mCalls += other.mCalls;
diff --git a/indra/llcommon/lltrace.h b/indra/llcommon/lltrace.h
index 00bab536ff9194c6fa90f383c808038902a50340..6dfe9e4b4ec91a161017a4a7ef5f414c6d0780c9 100644
--- a/indra/llcommon/lltrace.h
+++ b/indra/llcommon/lltrace.h
@@ -121,12 +121,12 @@ class AccumulatorBuffer : public LLRefCount
 		return mStorage[index]; 
 	}
 
-	void addSamples(const AccumulatorBuffer<ACCUMULATOR>& other)
+	void addSamples(const AccumulatorBuffer<ACCUMULATOR>& other, bool append = true)
 	{
 		llassert(mStorageSize >= sNextStorageSlot && other.mStorageSize > sNextStorageSlot);
 		for (size_t i = 0; i < sNextStorageSlot; i++)
 		{
-			mStorage[i].addSamples(other.mStorage[i]);
+			mStorage[i].addSamples(other.mStorage[i], append);
 		}
 	}
 
@@ -310,7 +310,7 @@ class EventAccumulator
 		mLastValue = value;
 	}
 
-	void addSamples(const self_t& other)
+	void addSamples(const self_t& other, bool append)
 	{
 		if (other.mNumSamples)
 		{
@@ -350,7 +350,7 @@ class EventAccumulator
 			F64 weight = (F64)mNumSamples / (F64)(mNumSamples + other.mNumSamples);
 			mNumSamples += other.mNumSamples;
 			mMean = mMean * weight + other.mMean * (1.f - weight);
-			mLastValue = other.mLastValue;
+			if (append) mLastValue = other.mLastValue;
 		}
 	}
 
@@ -434,7 +434,7 @@ class SampleAccumulator
 		mHasValue = true;
 	}
 
-	void addSamples(const self_t& other)
+	void addSamples(const self_t& other, bool append)
 	{
 		if (other.mTotalSamplingTime)
 		{
@@ -476,9 +476,12 @@ class SampleAccumulator
 			mNumSamples += other.mNumSamples;
 			mTotalSamplingTime += other.mTotalSamplingTime;
 			mMean = (mMean * weight) + (other.mMean * (1.0 - weight));
-			mLastValue = other.mLastValue;
-			mLastSampleTimeStamp = other.mLastSampleTimeStamp;
-			mHasValue |= other.mHasValue;
+			if (append)
+			{
+				mLastValue = other.mLastValue;
+				mLastSampleTimeStamp = other.mLastSampleTimeStamp;
+				mHasValue |= other.mHasValue;
+			}
 		}
 	}
 
@@ -551,7 +554,7 @@ class CountAccumulator
 		mSum += value;
 	}
 
-	void addSamples(const CountAccumulator<T>& other)
+	void addSamples(const CountAccumulator<T>& other, bool /*append*/)
 	{
 		mSum += other.mSum;
 		mNumSamples += other.mNumSamples;
@@ -596,7 +599,7 @@ class TimeBlockAccumulator
 	};
 
 	TimeBlockAccumulator();
-	void addSamples(const self_t& other);
+	void addSamples(const self_t& other, bool /*append*/);
 	void reset(const self_t* other);
 	void flush() {}
 
@@ -716,6 +719,8 @@ void add(CountStatHandle<T>& count, VALUE_T value)
 
 struct MemStatAccumulator
 {
+	typedef MemStatAccumulator self_t;
+
 	MemStatAccumulator()
 	:	mSize(0),
 		mChildSize(0),
@@ -723,7 +728,7 @@ struct MemStatAccumulator
 		mDeallocatedCount(0)
 	{}
 
-	void addSamples(const MemStatAccumulator& other)
+	void addSamples(const MemStatAccumulator& other, bool /*append*/)
 	{
 		mSize += other.mSize;
 		mChildSize += other.mChildSize;
diff --git a/indra/llcommon/lltracerecording.cpp b/indra/llcommon/lltracerecording.cpp
index 86cdca3e10d1a4cf1cbd463b5f54164c0968b797..3994e4f5214d87a859051d9348592a7e60c6e4b2 100644
--- a/indra/llcommon/lltracerecording.cpp
+++ b/indra/llcommon/lltracerecording.cpp
@@ -97,13 +97,15 @@ void RecordingBuffers::append( const RecordingBuffers& other )
 
 void RecordingBuffers::merge( const RecordingBuffers& other)
 {
-	mCountsFloat.addSamples(other.mCountsFloat);
-	mCounts.addSamples(other.mCounts);
-	mSamplesFloat.addSamples(other.mSamplesFloat);
-	mSamples.addSamples(other.mSamples);
-	mEventsFloat.addSamples(other.mEventsFloat);
-	mEvents.addSamples(other.mEvents);
-	mMemStats.addSamples(other.mMemStats);
+	mCountsFloat.addSamples(other.mCountsFloat, false);
+	mCounts.addSamples(other.mCounts, false);
+	mSamplesFloat.addSamples(other.mSamplesFloat, false);
+	mSamples.addSamples(other.mSamples, false);
+	mEventsFloat.addSamples(other.mEventsFloat, false);
+	mEvents.addSamples(other.mEvents, false);
+	mMemStats.addSamples(other.mMemStats, false);
+	// For now, stack timers are held out of the merge because they need to be displayed per thread.
+	//mStackTimers.addSamples(other.mStackTimers, false);
 }
 
 void RecordingBuffers::reset(RecordingBuffers* other)
@@ -190,7 +192,6 @@ void Recording::handleStop()
 {
 	mElapsedSeconds += mSamplingTimer.getElapsedTimeF64();
 	mBuffers.write()->flush();
-	LLTrace::TimeBlock::processTimes();
 	LLTrace::get_thread_recorder()->deactivate(this);
 }
 
diff --git a/indra/llcommon/lltracethreadrecorder.cpp b/indra/llcommon/lltracethreadrecorder.cpp
index 89b5df1f942d865efd20f755cfe9ec0a747ff611..75c7cb2ff1772487b4b50a97087a55bea98c3fea 100644
--- a/indra/llcommon/lltracethreadrecorder.cpp
+++ b/indra/llcommon/lltracethreadrecorder.cpp
@@ -202,21 +202,14 @@ SlaveThreadRecorder::~SlaveThreadRecorder()
 	mMasterRecorder.removeSlaveThread(this);
 }
 
-bool SlaveThreadRecorder::pushToMaster()
+void SlaveThreadRecorder::pushToMaster() // appends this thread's recording to mSharedData
 {
-	if (mPushCount != mMasterRecorder.getPullCount())
+	mThreadRecording.stop(); // recording is stopped while it is copied out, restarted below
 	{
-		mThreadRecording.stop();
-		{
-			LLMutexLock(mMasterRecorder.getSlaveListMutex());
-			mSharedData.appendFrom(mThreadRecording);
-		}
-		mThreadRecording.start();
-
-		mPushCount = mMasterRecorder.getPullCount();
-		return true;
+		LLMutexLock lock(mMasterRecorder.getSlaveListMutex()); // named guard: an unnamed temporary would unlock immediately
+		mSharedData.appendFrom(mThreadRecording);
 	}
-	return false;
+	mThreadRecording.start();
 }
 
 void SlaveThreadRecorder::SharedData::appendFrom( const Recording& source )
@@ -271,8 +264,6 @@ void MasterThreadRecorder::pullFromSlaveThreads()
 		(*it)->mSharedData.mergeTo(target_recording_buffers);
 		(*it)->mSharedData.reset();
 	}
-
-	mPullCount++;
 }
 
 void MasterThreadRecorder::addSlaveThread( class SlaveThreadRecorder* child )
@@ -298,10 +289,8 @@ void MasterThreadRecorder::removeSlaveThread( class SlaveThreadRecorder* child )
 	}
 }
 
-bool MasterThreadRecorder::pushToMaster()
-{
-	return false;
-}
+void MasterThreadRecorder::pushToMaster()
+{}
 
 MasterThreadRecorder::MasterThreadRecorder()
 {}
diff --git a/indra/llcommon/lltracethreadrecorder.h b/indra/llcommon/lltracethreadrecorder.h
index a044757e62592ab4610646bfa5e4416d9f09ceb1..bf3701304fd63af7a668f1c3d226241ed73f4a8b 100644
--- a/indra/llcommon/lltracethreadrecorder.h
+++ b/indra/llcommon/lltracethreadrecorder.h
@@ -49,7 +49,7 @@ namespace LLTrace
 		void deactivate(Recording* recording);
 		active_recording_list_t::reverse_iterator bringUpToDate(Recording* recording);
 
-		virtual bool pushToMaster() = 0;
+		virtual void pushToMaster() = 0;
 
 		TimeBlockTreeNode* getTimeBlockTreeNode(S32 index);
 
@@ -80,14 +80,13 @@ namespace LLTrace
 		void addSlaveThread(class SlaveThreadRecorder* child);
 		void removeSlaveThread(class SlaveThreadRecorder* child);
 
-		/*virtual */ bool pushToMaster();
+		/*virtual */ void pushToMaster();
 
 		// call this periodically to gather stats data from slave threads
 		void pullFromSlaveThreads();
 
 		LLMutex* getSlaveListMutex() { return &mSlaveListMutex; }
 
-		U32	getPullCount() { return mPullCount; }
 
 	private:
 
@@ -95,7 +94,6 @@ namespace LLTrace
 
 		slave_thread_recorder_list_t	mSlaveThreadRecorders;	// list of slave thread recorders associated with this master
 		LLMutex							mSlaveListMutex;		// protects access to slave list
-		LLAtomicU32						mPullCount;				// number of times data has been pulled from slaves
 	};
 
 	class LL_COMMON_API SlaveThreadRecorder : public ThreadRecorder
@@ -105,7 +103,7 @@ namespace LLTrace
 		~SlaveThreadRecorder();
 
 		// call this periodically to gather stats data for master thread to consume
-		/*virtual*/ bool pushToMaster();
+		/*virtual*/ void pushToMaster();
 
 		MasterThreadRecorder* 	mMaster;
 
@@ -122,7 +120,6 @@ namespace LLTrace
 		};
 		SharedData				mSharedData;
 		MasterThreadRecorder&	mMasterRecorder;
-		U32						mPushCount;
 	};
 }
 
diff --git a/indra/newview/llappviewer.cpp b/indra/newview/llappviewer.cpp
index a049aabe7493856097d170969d6d881dab5a3971..ff481d6278abf443cb4121522175eb9472e4fcbd 100644
--- a/indra/newview/llappviewer.cpp
+++ b/indra/newview/llappviewer.cpp
@@ -1241,6 +1241,7 @@ bool LLAppViewer::mainLoop()
 	while (!LLApp::isExiting())
 	{
 		LLFastTimer _(FTM_FRAME);
+		LLTrace::TimeBlock::processTimes();
 		LLTrace::get_frame_recording().nextPeriod();
 		LLTrace::TimeBlock::logStats();