diff --git a/indra/llcommon/llfasttimer.cpp b/indra/llcommon/llfasttimer.cpp
index 23e27622bfcd8671a42cb0803adba5bf4bd1c758..7a7f1c79c15e889050b74b0ed6c48f0b45de5f12 100755
--- a/indra/llcommon/llfasttimer.cpp
+++ b/indra/llcommon/llfasttimer.cpp
@@ -247,17 +247,18 @@ void TimeBlock::incrementalUpdateTimerTree()
 
 
 void TimeBlock::updateTimes()
-	{
-	U64 cur_time = getCPUClockCount64();
-
+{
 	// walk up stack of active timers and accumulate current time while leaving timing structures active
-	BlockTimerStackRecord* stack_record	= ThreadTimerStack::getInstance();
+	BlockTimerStackRecord* stack_record	= LLThreadLocalSingletonPointer<BlockTimerStackRecord>::getInstance();
+	if (stack_record) return;
+
+	U64 cur_time = getCPUClockCount64();
 	BlockTimer* cur_timer				= stack_record->mActiveTimer;
 	TimeBlockAccumulator* accumulator	= stack_record->mTimeBlock->getPrimaryAccumulator();
 
 	while(cur_timer 
 		&& cur_timer->mParentTimerData.mActiveTimer != cur_timer) // root defined by parent pointing to self
-		{
+	{
 		U64 cumulative_time_delta = cur_time - cur_timer->mStartTime;
 		accumulator->mTotalTimeCounter += cumulative_time_delta 
 			- (accumulator->mTotalTimeCounter 
diff --git a/indra/llcommon/llfasttimer.h b/indra/llcommon/llfasttimer.h
index ab8612a8ade8b0e963fa1f7d4c47ca6d59eaf8ca..73c40749edb05f4cc6a39893f04055cf04b83eec 100755
--- a/indra/llcommon/llfasttimer.h
+++ b/indra/llcommon/llfasttimer.h
@@ -38,22 +38,6 @@ class LLMutex;
 namespace LLTrace
 {
 
-class ThreadTimerStack 
-:	public BlockTimerStackRecord, 
-	public LLThreadLocalSingleton<ThreadTimerStack>
-{
-	friend class LLThreadLocalSingleton<ThreadTimerStack>;
-	ThreadTimerStack() 
-	{}
-
-public:
-	ThreadTimerStack& operator=(const BlockTimerStackRecord& other)
-	{
-		BlockTimerStackRecord::operator=(other);
-		return *this;
-	}
-};
-
 class BlockTimer
 {
 public:
@@ -271,7 +255,8 @@ class TimeBlock
 LL_FORCE_INLINE BlockTimer::BlockTimer(TimeBlock& timer)
 {
 #if FAST_TIMER_ON
-	BlockTimerStackRecord* cur_timer_data = ThreadTimerStack::getIfExists();
+	BlockTimerStackRecord* cur_timer_data = LLThreadLocalSingletonPointer<BlockTimerStackRecord>::getInstance();
+	if (!cur_timer_data) return;
 	TimeBlockAccumulator* accumulator = timer.getPrimaryAccumulator();
 	accumulator->mActiveCount++;
 	mBlockStartTotalTimeCounter = accumulator->mTotalTimeCounter;
@@ -293,7 +278,9 @@ LL_FORCE_INLINE BlockTimer::~BlockTimer()
 {
 #if FAST_TIMER_ON
 	U64 total_time = TimeBlock::getCPUClockCount64() - mStartTime;
-	BlockTimerStackRecord* cur_timer_data = ThreadTimerStack::getIfExists();
+	BlockTimerStackRecord* cur_timer_data = LLThreadLocalSingletonPointer<BlockTimerStackRecord>::getInstance();
+	if (!cur_timer_data) return;
+
 	TimeBlockAccumulator* accumulator = cur_timer_data->mTimeBlock->getPrimaryAccumulator();
 
 	accumulator->mCalls++;
diff --git a/indra/llcommon/llqueuedthread.cpp b/indra/llcommon/llqueuedthread.cpp
index 4339f203db73f1f48bea0322ccd7eba106cd6f6d..3689c4728e3c56fee6a3c816469c4761dd17a059 100755
--- a/indra/llcommon/llqueuedthread.cpp
+++ b/indra/llcommon/llqueuedthread.cpp
@@ -470,7 +470,7 @@ S32 LLQueuedThread::processNextRequest()
 			}
 		}
 		
-		LLTrace::get_thread_recorder()->pushToMaster();
+		LLTrace::get_thread_recorder()->pushToParent();
 	}
 
 	S32 pending = getPending();
@@ -502,7 +502,7 @@ void LLQueuedThread::run()
 		
 		if (isQuitting())
 		{
-			LLTrace::get_thread_recorder()->pushToMaster();
+			LLTrace::get_thread_recorder()->pushToParent();
 			endThread();
 			break;
 		}
diff --git a/indra/llcommon/llthread.cpp b/indra/llcommon/llthread.cpp
index e8e546e76944faccbcbe4a95a45360c510e00131..d07cccdf15291ab68c0175a8bc45e36ceaa6e856 100755
--- a/indra/llcommon/llthread.cpp
+++ b/indra/llcommon/llthread.cpp
@@ -93,7 +93,7 @@ void *APR_THREAD_FUNC LLThread::staticRun(apr_thread_t *apr_threadp, void *datap
 {
 	LLThread *threadp = (LLThread *)datap;
 
-	LLTrace::SlaveThreadRecorder thread_recorder(LLTrace::getUIThreadRecorder());
+	LLTrace::ThreadRecorder thread_recorder(LLTrace::getUIThreadRecorder());
 
 #if !LL_DARWIN
 	sThreadID = threadp->mID;
diff --git a/indra/llcommon/llthreadlocalstorage.h b/indra/llcommon/llthreadlocalstorage.h
index d6399d5131f99714cf40b322d612510e0b2a95b8..3b2f5f41939c6b5c70dbf67d987b707f77fcd38b 100644
--- a/indra/llcommon/llthreadlocalstorage.h
+++ b/indra/llcommon/llthreadlocalstorage.h
@@ -124,175 +124,6 @@ class LLThreadLocalPointer : public LLThreadLocalPointerBase
 	bool notNull() const { return sInitialized && get() != NULL; }
 };
 
-template<typename DERIVED_TYPE>
-class LLThreadLocalSingleton
-{
-	typedef enum e_init_state
-	{
-		UNINITIALIZED = 0,
-		CONSTRUCTING,
-		INITIALIZING,
-		INITIALIZED,
-		DELETED
-	} EInitState;
-
-public:
-	LLThreadLocalSingleton()
-	{}
-	
-	virtual ~LLThreadLocalSingleton()
-	{
-#if LL_DARWIN
-        pthread_setspecific(sInstanceKey, NULL);
-#else
-        sData.mInstance = NULL;
-#endif
-		setInitState(DELETED);
-	}
-
-	static void deleteSingleton()
-	{
-		delete getIfExists();
-	}
-
-	static DERIVED_TYPE* getInstance()
-	{
-        EInitState init_state = getInitState();
-		if (init_state == CONSTRUCTING)
-		{
-			llerrs << "Tried to access singleton " << typeid(DERIVED_TYPE).name() << " from singleton constructor!" << llendl;
-		}
-
-		if (init_state == DELETED)
-		{
-			llwarns << "Trying to access deleted singleton " << typeid(DERIVED_TYPE).name() << " creating new instance" << llendl;
-		}
-
-#if LL_DARWIN
-        createTLSInstance();
-#endif
-		if (!getIfExists())
-		{
-			setInitState(CONSTRUCTING);
-            DERIVED_TYPE* instancep = new DERIVED_TYPE();
-#if LL_DARWIN
-            S32 result = pthread_setspecific(sInstanceKey, (void*)instancep);
-            if (result != 0)
-            {
-                llerrs << "Could not set thread local storage" << llendl;
-            }
-#else
-			sData.mInstance = instancep;
-#endif
-			setInitState(INITIALIZING);
-			instancep->initSingleton();
-			setInitState(INITIALIZED);
-		}
-
-        return getIfExists();
-	}
-
-	static DERIVED_TYPE* getIfExists()
-	{
-#if LL_DARWIN
-        return (DERIVED_TYPE*)pthread_getspecific(sInstanceKey);
-#else
-		return sData.mInstance;
-#endif
-	}
-
-	// Reference version of getInstance()
-	// Preferred over getInstance() as it disallows checking for NULL
-	static DERIVED_TYPE& instance()
-	{
-		return *getInstance();
-	}
-
-	// Has this singleton been created uet?
-	// Use this to avoid accessing singletons before the can safely be constructed
-	static bool instanceExists()
-	{
-		return getInitState() == INITIALIZED;
-	}
-
-	// Has this singleton already been deleted?
-	// Use this to avoid accessing singletons from a static object's destructor
-	static bool destroyed()
-	{
-		return getInitState() == DELETED;
-	}
-private:
-#if LL_DARWIN
-    static void createTLSInitState()
-    {
-        static S32 key_created = pthread_key_create(&sInitStateKey, NULL);
-        if (key_created != 0)
-        {
-            llerrs << "Could not create thread local storage" << llendl;
-        }
-    }
-    
-    static void createTLSInstance()
-    {
-        static S32 key_created = pthread_key_create(&sInstanceKey, NULL);
-        if (key_created != 0)
-        {
-            llerrs << "Could not create thread local storage" << llendl;
-        }
-    }
-#endif
-    static EInitState getInitState()
-    {
-#if LL_DARWIN
-        createTLSInitState();
-        return (EInitState)(int)pthread_getspecific(sInitStateKey);
-#else
-        return sData.mInitState;
-#endif
-    }
-    
-    static void setInitState(EInitState state)
-    {
-#if LL_DARWIN
-        createTLSInitState();
-        pthread_setspecific(sInitStateKey, (void*)state);
-#else
-        sData.mInitState = state;
-#endif
-    }
-	LLThreadLocalSingleton(const LLThreadLocalSingleton& other);
-	virtual void initSingleton() {}
-
-	struct SingletonData
-	{
-		DERIVED_TYPE*	mInstance;
-		EInitState		mInitState;
-	};
-#ifdef LL_WINDOWS
-	static __declspec(thread) SingletonData sData;
-#elif LL_LINUX
-	static __thread SingletonData sData;
-#elif LL_DARWIN
-    static pthread_key_t sInstanceKey;
-    static pthread_key_t sInitStateKey;
-#endif
-};
-
-#if LL_WINDOWS
-template<typename DERIVED_TYPE>
-__declspec(thread) typename LLThreadLocalSingleton<DERIVED_TYPE>::SingletonData LLThreadLocalSingleton<DERIVED_TYPE>::sData = {NULL, LLThreadLocalSingleton<DERIVED_TYPE>::UNINITIALIZED};
-#elif LL_LINUX
-template<typename DERIVED_TYPE>
-__thread typename LLThreadLocalSingleton<DERIVED_TYPE>::SingletonData LLThreadLocalSingleton<DERIVED_TYPE>::sData = {NULL, LLThreadLocalSingleton<DERIVED_TYPE>::UNINITIALIZED};
-#elif LL_DARWIN
-template<typename DERIVED_TYPE>
-pthread_key_t LLThreadLocalSingleton<DERIVED_TYPE>::sInstanceKey;
-
-template<typename DERIVED_TYPE>
-pthread_key_t LLThreadLocalSingleton<DERIVED_TYPE>::sInitStateKey;
-
-#endif
-
 template<typename DERIVED_TYPE>
 class LLThreadLocalSingletonPointer
 {
diff --git a/indra/llcommon/lltrace.cpp b/indra/llcommon/lltrace.cpp
index 25807c7b2cd00f7a78d257840dc06307bbbff247..26c19e5121fd9e5753401bfcc86df9082f2d2c7b 100644
--- a/indra/llcommon/lltrace.cpp
+++ b/indra/llcommon/lltrace.cpp
@@ -39,7 +39,7 @@ void init()
 {
 	if (sInitializationCount++ == 0)
 	{
-		gUIThreadRecorder = new MasterThreadRecorder();
+		gUIThreadRecorder = new ThreadRecorder();
 	}
 }
 
diff --git a/indra/llcommon/lltracethreadrecorder.cpp b/indra/llcommon/lltracethreadrecorder.cpp
index d0f0328d1cccd2dd0950a3de73ac106feb71dea9..e88c5bf177cd9dfdb495cc0504eadca35ad9cd64 100644
--- a/indra/llcommon/lltracethreadrecorder.cpp
+++ b/indra/llcommon/lltracethreadrecorder.cpp
@@ -31,7 +31,7 @@
 namespace LLTrace
 {
 
-MasterThreadRecorder* gUIThreadRecorder = NULL;
+ThreadRecorder* gUIThreadRecorder = NULL;
 
 ///////////////////////////////////////////////////////////////////////
 // ThreadRecorder
@@ -39,11 +39,17 @@ MasterThreadRecorder* gUIThreadRecorder = NULL;
 
 ThreadRecorder::ThreadRecorder()
 {
+	init();
+}
+
+void ThreadRecorder::init()
+{
+	LLThreadLocalSingletonPointer<BlockTimerStackRecord>::setInstance(&mBlockTimerStackRecord);
 	//NB: the ordering of initialization in this function is very fragile due to a large number of implicit dependencies
 	set_thread_recorder(this);
 	TimeBlock& root_time_block = TimeBlock::getRootTimeBlock();
 
-	ThreadTimerStack* timer_stack = ThreadTimerStack::getInstance();
+	BlockTimerStackRecord* timer_stack = LLThreadLocalSingletonPointer<BlockTimerStackRecord>::getInstance();
 	timer_stack->mTimeBlock = &root_time_block;
 	timer_stack->mActiveTimer = NULL;
 
@@ -71,8 +77,19 @@ ThreadRecorder::ThreadRecorder()
 	TimeBlock::getRootTimeBlock().getPrimaryAccumulator()->mActiveCount = 1;
 }
 
+
+ThreadRecorder::ThreadRecorder(ThreadRecorder& master)
+:	mMasterRecorder(&master)
+{
+	init();
+	mMasterRecorder->addChildRecorder(this);
+}
+
+
 ThreadRecorder::~ThreadRecorder()
 {
+	LLThreadLocalSingletonPointer<BlockTimerStackRecord>::setInstance(NULL);
+
 	deactivate(&mThreadRecordingBuffers);
 
 	delete mRootTimer;
@@ -85,6 +102,11 @@ ThreadRecorder::~ThreadRecorder()
 
 	set_thread_recorder(NULL);
 	delete[] mTimeBlockTreeNodes;
+
+	if (mMasterRecorder)
+	{
+		mMasterRecorder->removeChildRecorder(this);
+	}
 }
 
 TimeBlockTreeNode* ThreadRecorder::getTimeBlockTreeNode( S32 index )
@@ -190,86 +212,63 @@ void ThreadRecorder::ActiveRecording::movePartialToTarget()
 }
 
 
-///////////////////////////////////////////////////////////////////////
-// SlaveThreadRecorder
-///////////////////////////////////////////////////////////////////////
-
-SlaveThreadRecorder::SlaveThreadRecorder(MasterThreadRecorder& master)
-:	mMasterRecorder(master)
-{
-	mMasterRecorder.addSlaveThread(this);
+// called by child thread
+void ThreadRecorder::addChildRecorder( class ThreadRecorder* child )
+{ LLMutexLock lock(&mChildListMutex);
+	mChildThreadRecorders.push_back(child);
 }
 
-SlaveThreadRecorder::~SlaveThreadRecorder()
+// called by child thread
+void ThreadRecorder::removeChildRecorder( class ThreadRecorder* child )
+{ LLMutexLock lock(&mChildListMutex);
+
+for (child_thread_recorder_list_t::iterator it = mChildThreadRecorders.begin(), end_it = mChildThreadRecorders.end();
+	it != end_it;
+	++it)
 {
-	mMasterRecorder.removeSlaveThread(this);
+	if ((*it) == child)
+	{
+		mChildThreadRecorders.erase(it);
+		break;
+	}
+}
 }
 
-void SlaveThreadRecorder::pushToMaster()
-{ 
+void ThreadRecorder::pushToParent()
+{
 	{ LLMutexLock lock(&mSharedRecordingMutex);	
 		LLTrace::get_thread_recorder()->bringUpToDate(&mThreadRecordingBuffers);
 		mSharedRecordingBuffers.append(mThreadRecordingBuffers);
 	}
 }
+	
+
 
-///////////////////////////////////////////////////////////////////////
-// MasterThreadRecorder
-///////////////////////////////////////////////////////////////////////
 
-static LLFastTimer::DeclareTimer FTM_PULL_TRACE_DATA_FROM_SLAVES("Pull slave trace data");
+static LLFastTimer::DeclareTimer FTM_PULL_TRACE_DATA_FROM_CHILDREN("Pull child thread trace data");
 
-void MasterThreadRecorder::pullFromSlaveThreads()
+void ThreadRecorder::pullFromChildren()
 {
-	/*LLFastTimer _(FTM_PULL_TRACE_DATA_FROM_SLAVES);
+	LLFastTimer _(FTM_PULL_TRACE_DATA_FROM_CHILDREN);
 	if (mActiveRecordings.empty()) return;
 
-	{ LLMutexLock lock(&mSlaveListMutex);
-
-	AccumulatorBufferGroup& target_recording_buffers = mActiveRecordings.back()->mPartialRecording;
-	target_recording_buffers.sync();
-	for (slave_thread_recorder_list_t::iterator it = mSlaveThreadRecorders.begin(), end_it = mSlaveThreadRecorders.end();
-	it != end_it;
-	++it)
-	{ LLMutexLock lock(&(*it)->mSharedRecordingMutex);
-
-	target_recording_buffers.merge((*it)->mSharedRecordingBuffers);
-	(*it)->mSharedRecordingBuffers.reset();
-	}
-	}*/
-}
+	{ LLMutexLock lock(&mChildListMutex);
 
-// called by slave thread
-void MasterThreadRecorder::addSlaveThread( class SlaveThreadRecorder* child )
-{ LLMutexLock lock(&mSlaveListMutex);
+		AccumulatorBufferGroup& target_recording_buffers = mActiveRecordings.back()->mPartialRecording;
+		target_recording_buffers.sync();
+		for (child_thread_recorder_list_t::iterator it = mChildThreadRecorders.begin(), end_it = mChildThreadRecorders.end();
+			it != end_it;
+			++it)
+		{ LLMutexLock lock(&(*it)->mSharedRecordingMutex);
 
-	mSlaveThreadRecorders.push_back(child);
-}
-
-// called by slave thread
-void MasterThreadRecorder::removeSlaveThread( class SlaveThreadRecorder* child )
-{ LLMutexLock lock(&mSlaveListMutex);
-
-	for (slave_thread_recorder_list_t::iterator it = mSlaveThreadRecorders.begin(), end_it = mSlaveThreadRecorders.end();
-		it != end_it;
-		++it)
-	{
-		if ((*it) == child)
-		{
-			mSlaveThreadRecorders.erase(it);
-			break;
+			target_recording_buffers.merge((*it)->mSharedRecordingBuffers);
+			(*it)->mSharedRecordingBuffers.reset();
 		}
 	}
 }
 
-void MasterThreadRecorder::pushToMaster()
-{}
-
-MasterThreadRecorder::MasterThreadRecorder()
-{}
-
 
-MasterThreadRecorder& getUIThreadRecorder()
+ThreadRecorder& getUIThreadRecorder()
 {
 	llassert(gUIThreadRecorder != NULL);
 	return *gUIThreadRecorder;
diff --git a/indra/llcommon/lltracethreadrecorder.h b/indra/llcommon/lltracethreadrecorder.h
index 6b7a8e5865e5fb689bbaed262b28639444bf201c..b5ed77416c14ad9d0bc086e7635b984439db9a51 100644
--- a/indra/llcommon/lltracethreadrecorder.h
+++ b/indra/llcommon/lltracethreadrecorder.h
@@ -43,17 +43,26 @@ namespace LLTrace
 		typedef std::vector<ActiveRecording*> active_recording_list_t;
 	public:
 		ThreadRecorder();
+		explicit ThreadRecorder(ThreadRecorder& master);
 
-		virtual ~ThreadRecorder();
+		~ThreadRecorder();
 
 		void activate(AccumulatorBufferGroup* recording);
 		void deactivate(AccumulatorBufferGroup* recording);
 		active_recording_list_t::reverse_iterator bringUpToDate(AccumulatorBufferGroup* recording);
 
-		virtual void pushToMaster() = 0;
+		void addChildRecorder(class ThreadRecorder* child);
+		void removeChildRecorder(class ThreadRecorder* child);
+
+		// call this periodically to gather stats data from child threads
+		void pullFromChildren();
+		void pushToParent();
 
 		TimeBlockTreeNode* getTimeBlockTreeNode(S32 index);
 
+	protected:
+		void init();
+
 	protected:
 		struct ActiveRecording
 		{
@@ -64,59 +73,31 @@ namespace LLTrace
 
 			void movePartialToTarget();
 		};
-		AccumulatorBufferGroup			mThreadRecordingBuffers;
-
-		active_recording_list_t		mActiveRecordings;
-
-		class BlockTimer*			mRootTimer;
-		TimeBlockTreeNode*			mTimeBlockTreeNodes;
-		size_t						mNumTimeBlockTreeNodes;
-		BlockTimerStackRecord		mBlockTimerStackRecord;
-	};
-
-	class LL_COMMON_API MasterThreadRecorder : public ThreadRecorder
-	{
-	public:
-		MasterThreadRecorder();
-
-		void addSlaveThread(class SlaveThreadRecorder* child);
-		void removeSlaveThread(class SlaveThreadRecorder* child);
-
-		/*virtual */ void pushToMaster();
-
-		// call this periodically to gather stats data from slave threads
-		void pullFromSlaveThreads();
-
-	private:
 
-		typedef std::list<class SlaveThreadRecorder*> slave_thread_recorder_list_t;
+		AccumulatorBufferGroup			mThreadRecordingBuffers;
 
-		slave_thread_recorder_list_t	mSlaveThreadRecorders;	// list of slave thread recorders associated with this master
-		LLMutex							mSlaveListMutex;		// protects access to slave list
-	};
+		BlockTimerStackRecord			mBlockTimerStackRecord;
+		active_recording_list_t			mActiveRecordings;
 
-	class LL_COMMON_API SlaveThreadRecorder : public ThreadRecorder
-	{
-	public:
-		SlaveThreadRecorder(MasterThreadRecorder& master);
-		~SlaveThreadRecorder();
+		class BlockTimer*				mRootTimer;
+		TimeBlockTreeNode*				mTimeBlockTreeNodes;
+		size_t							mNumTimeBlockTreeNodes;
+		typedef std::list<class ThreadRecorder*> child_thread_recorder_list_t;
 
-		// call this periodically to gather stats data for master thread to consume
-		/*virtual*/ void pushToMaster();
+		child_thread_recorder_list_t	mChildThreadRecorders;	// list of child thread recorders associated with this master
+		LLMutex							mChildListMutex;		// protects access to child list
+		LLMutex							mSharedRecordingMutex;
+		AccumulatorBufferGroup			mSharedRecordingBuffers;
+		ThreadRecorder*					mMasterRecorder;
 
-	private:
-		friend class MasterThreadRecorder;
-		LLMutex					mSharedRecordingMutex;
-		AccumulatorBufferGroup	mSharedRecordingBuffers;
-		MasterThreadRecorder&	mMasterRecorder;
 	};
 
 	//FIXME: let user code set up thread recorder topology
-	extern MasterThreadRecorder* gUIThreadRecorder ;
+	extern ThreadRecorder* gUIThreadRecorder ;
 
 	const LLThreadLocalPointer<class ThreadRecorder>& get_thread_recorder();
 	void set_thread_recorder(class ThreadRecorder*);
-	class MasterThreadRecorder& getUIThreadRecorder();
+	ThreadRecorder& getUIThreadRecorder();
 
 }
 
diff --git a/indra/newview/llappviewer.cpp b/indra/newview/llappviewer.cpp
index 9308f0f4e9d20962c9f6bc68ca3093d8c6b3f3d8..348eb43b5e82aa3d8134f4abd3a12a76f2c4f962 100755
--- a/indra/newview/llappviewer.cpp
+++ b/indra/newview/llappviewer.cpp
@@ -1297,7 +1297,7 @@ bool LLAppViewer::mainLoop()
 		LLTrace::get_frame_recording().nextPeriod();
 		LLTrace::TimeBlock::logStats();
 
-		LLTrace::getUIThreadRecorder().pullFromSlaveThreads();
+		LLTrace::getUIThreadRecorder().pullFromChildren();
 
 		//clear call stack records
 		llclearcallstacks;