diff --git a/indra/llcommon/llfasttimer.h b/indra/llcommon/llfasttimer.h
index 5c2df877b0e56dbcdd40cb42064eeafe5ebe2868..c177027f4e2718159897cfe33d27277c29d8ae07 100755
--- a/indra/llcommon/llfasttimer.h
+++ b/indra/llcommon/llfasttimer.h
@@ -27,155 +27,10 @@
 #ifndef LL_FASTTIMER_H
 #define LL_FASTTIMER_H
 
+// Temporarily(?) de-inlined these functions to simplify diagnosis of problems.
+// Implementations of getCPUClockCount32() and getCPUClockCount64() are now in llfasttimer_class.cpp.
+
 // pull in the actual class definition
 #include "llfasttimer_class.h"
 
-//
-// Important note: These implementations must be FAST!
-//
-
-#if LL_WINDOWS
-//
-// Windows implementation of CPU clock
-//
-
-//
-// NOTE: put back in when we aren't using platform sdk anymore
-//
-// because MS has different signatures for these functions in winnt.h
-// need to rename them to avoid conflicts
-//#define _interlockedbittestandset _renamed_interlockedbittestandset
-//#define _interlockedbittestandreset _renamed_interlockedbittestandreset
-//#include <intrin.h>
-//#undef _interlockedbittestandset
-//#undef _interlockedbittestandreset
-
-//inline U32 LLFastTimer::getCPUClockCount32()
-//{
-//	U64 time_stamp = __rdtsc();
-//	return (U32)(time_stamp >> 8);
-//}
-//
-//// return full timer value, *not* shifted by 8 bits
-//inline U64 LLFastTimer::getCPUClockCount64()
-//{
-//	return __rdtsc();
-//}
-
-// shift off lower 8 bits for lower resolution but longer term timing
-// on 1Ghz machine, a 32-bit word will hold ~1000 seconds of timing
-#ifdef USE_RDTSC
-inline U32 LLFastTimer::getCPUClockCount32()
-{
-	U32 ret_val;
-	__asm
-	{
-        _emit   0x0f
-        _emit   0x31
-		shr eax,8
-		shl edx,24
-		or eax, edx
-		mov dword ptr [ret_val], eax
-	}
-    return ret_val;
-}
-
-// return full timer value, *not* shifted by 8 bits
-inline U64 LLFastTimer::getCPUClockCount64()
-{
-	U64 ret_val;
-	__asm
-	{
-        _emit   0x0f
-        _emit   0x31
-		mov eax,eax
-		mov edx,edx
-		mov dword ptr [ret_val+4], edx
-		mov dword ptr [ret_val], eax
-	}
-    return ret_val;
-}
-#else
-LL_COMMON_API U64 get_clock_count(); // in lltimer.cpp
-// These use QueryPerformanceCounter, which is arguably fine and also works on amd architectures.
-inline U32 LLFastTimer::getCPUClockCount32()
-{
-	return (U32)(get_clock_count()>>8);
-}
-
-inline U64 LLFastTimer::getCPUClockCount64()
-{
-	return get_clock_count();
-}
-#endif
-
-#endif
-
-
-#if (LL_LINUX || LL_SOLARIS) && !(defined(__i386__) || defined(__amd64__))
-//
-// Linux and Solaris implementation of CPU clock - non-x86.
-// This is accurate but SLOW!  Only use out of desperation.
-//
-// Try to use the MONOTONIC clock if available, this is a constant time counter
-// with nanosecond resolution (but not necessarily accuracy) and attempts are
-// made to synchronize this value between cores at kernel start. It should not
-// be affected by CPU frequency. If not available use the REALTIME clock, but
-// this may be affected by NTP adjustments or other user activity affecting
-// the system time.
-inline U64 LLFastTimer::getCPUClockCount64()
-{
-	struct timespec tp;
-	
-#ifdef CLOCK_MONOTONIC // MONOTONIC supported at build-time?
-	if (-1 == clock_gettime(CLOCK_MONOTONIC,&tp)) // if MONOTONIC isn't supported at runtime then ouch, try REALTIME
-#endif
-		clock_gettime(CLOCK_REALTIME,&tp);
-
-	return (tp.tv_sec*LLFastTimer::sClockResolution)+tp.tv_nsec;        
-}
-
-inline U32 LLFastTimer::getCPUClockCount32()
-{
-	return (U32)(LLFastTimer::getCPUClockCount64() >> 8);
-}
-#endif // (LL_LINUX || LL_SOLARIS) && !(defined(__i386__) || defined(__amd64__))
-
-
-#if (LL_LINUX || LL_SOLARIS || LL_DARWIN) && (defined(__i386__) || defined(__amd64__))
-//
-// Mac+Linux+Solaris FAST x86 implementation of CPU clock
-inline U32 LLFastTimer::getCPUClockCount32()
-{
-	U64 x;
-	__asm__ volatile (".byte 0x0f, 0x31": "=A"(x));
-	return (U32)(x >> 8);
-}
-
-inline U64 LLFastTimer::getCPUClockCount64()
-{
-	U64 x;
-	__asm__ volatile (".byte 0x0f, 0x31": "=A"(x));
-	return x;
-}
-#endif
-
-
-#if ( LL_DARWIN && !(defined(__i386__) || defined(__amd64__)))
-//
-// Mac PPC (deprecated) implementation of CPU clock
-//
-// Just use gettimeofday implementation for now
-
-inline U32 LLFastTimer::getCPUClockCount32()
-{
-	return (U32)(get_clock_count()>>8);
-}
-
-inline U64 LLFastTimer::getCPUClockCount64()
-{
-	return get_clock_count();
-}
-#endif
-
 #endif // LL_LLFASTTIMER_H
diff --git a/indra/llcommon/llfasttimer_class.cpp b/indra/llcommon/llfasttimer_class.cpp
old mode 100644
new mode 100755
index c45921cdec852503f547bce92af020100286a510..a3e006d70be7d3222c556da761db7386f9a559b6
--- a/indra/llcommon/llfasttimer_class.cpp
+++ b/indra/llcommon/llfasttimer_class.cpp
@@ -35,7 +35,9 @@
 
 #include <boost/bind.hpp>
 
+
 #if LL_WINDOWS
+#include "lltimer.h"
 #elif LL_LINUX || LL_SOLARIS
 #include <sys/time.h>
 #include <sched.h>
@@ -481,6 +483,19 @@ void LLFastTimer::NamedTimer::resetFrame()
 {
 	if (sLog)
 	{ //output current frame counts to performance log
+		// Diagnostic dump of the clock sources, emitted on every 100th pass through this sLog branch.
+		static S32 call_count = 0; // persists across calls; only advanced while sLog is set
+		if (call_count % 100 == 0)
+		{
+			llinfos << "countsPerSecond (32 bit): " << countsPerSecond() << llendl;
+			llinfos << "get_clock_count (64 bit): " << get_clock_count() << llendl; // NOTE(review): lltimer.h is only included under LL_WINDOWS in this file -- confirm get_clock_count() is declared on other platforms
+			llinfos << "LLProcessorInfo().getCPUFrequency() " << LLProcessorInfo().getCPUFrequency() << llendl;
+			llinfos << "getCPUClockCount32() " << getCPUClockCount32() << llendl;
+			llinfos << "getCPUClockCount64() " << getCPUClockCount64() << llendl;
+			llinfos << "elapsed sec " << ((F64)getCPUClockCount64())/((F64)LLProcessorInfo().getCPUFrequency()*1000000.0) << llendl; // assumes getCPUFrequency() is in MHz and the 64-bit count ticks at CPU frequency -- TODO confirm for the get_clock_count() based builds
+		}
+		call_count++;
+		
 		F64 iclock_freq = 1000.0 / countsPerSecond(); // good place to calculate clock frequency
 
 		F64 total_time = 0;
@@ -762,3 +777,152 @@ LLFastTimer::LLFastTimer(LLFastTimer::FrameState* state)
 
 
 //////////////////////////////////////////////////////////////////////////////
+//
+// Important note: These implementations must be FAST!
+//
+
+
+#if LL_WINDOWS
+//
+// Windows implementation of CPU clock
+//
+
+//
+// NOTE: put back in when we aren't using platform sdk anymore
+//
+// because MS has different signatures for these functions in winnt.h
+// need to rename them to avoid conflicts
+//#define _interlockedbittestandset _renamed_interlockedbittestandset
+//#define _interlockedbittestandreset _renamed_interlockedbittestandreset
+//#include <intrin.h>
+//#undef _interlockedbittestandset
+//#undef _interlockedbittestandreset
+
+//inline U32 LLFastTimer::getCPUClockCount32()
+//{
+//	U64 time_stamp = __rdtsc();
+//	return (U32)(time_stamp >> 8);
+//}
+//
+//// return full timer value, *not* shifted by 8 bits
+//inline U64 LLFastTimer::getCPUClockCount64()
+//{
+//	return __rdtsc();
+//}
+
+// shift off lower 8 bits for lower resolution but longer term timing
+// on 1Ghz machine, a 32-bit word will hold ~1000 seconds of timing
+#ifdef USE_RDTSC
+U32 LLFastTimer::getCPUClockCount32() // out-of-line (no 'inline'): other translation units link against this definition
+{
+	U32 ret_val; // receives bits 8..39 of the time stamp; _emit 0x0f,0x31 is the RDTSC opcode (result in edx:eax)
+	__asm
+	{
+		_emit   0x0f
+		_emit   0x31
+		shr eax,8
+		shl edx,24
+		or eax, edx
+		mov dword ptr [ret_val], eax
+	}
+	return ret_val;
+}
+
+// return full timer value, *not* shifted by 8 bits
+U64 LLFastTimer::getCPUClockCount64() // out-of-line (no 'inline'): other translation units link against this definition
+{
+	U64 ret_val; // edx -> high dword, eax -> low dword; the self-moves presumably just tell the compiler eax/edx are modified, since _emit hides RDTSC from it (TODO confirm)
+	__asm
+	{
+		_emit   0x0f
+		_emit   0x31
+		mov eax,eax
+		mov edx,edx
+		mov dword ptr [ret_val+4], edx
+		mov dword ptr [ret_val], eax
+	}
+	return ret_val;
+}
+#else
+// get_clock_count() lives in lltimer.cpp; its declaration presumably now comes from lltimer.h, included above for LL_WINDOWS.
+// These use QueryPerformanceCounter, which is arguably fine and also works on amd architectures.
+U32 LLFastTimer::getCPUClockCount32() // get_clock_count() >> 8, truncated to 32 bits; out-of-line so other translation units can link to it
+{
+	return (U32)(get_clock_count()>>8);
+}
+
+U64 LLFastTimer::getCPUClockCount64() // full get_clock_count() value, *not* shifted by 8 bits; out-of-line so other translation units can link to it
+{
+	return get_clock_count();
+}
+#endif
+
+#endif
+
+
+#if (LL_LINUX || LL_SOLARIS) && !(defined(__i386__) || defined(__amd64__))
+//
+// Linux and Solaris implementation of CPU clock - non-x86.
+// This is accurate but SLOW!  Only use out of desperation.
+//
+// Try to use the MONOTONIC clock if available, this is a constant time counter
+// with nanosecond resolution (but not necessarily accuracy) and attempts are
+// made to synchronize this value between cores at kernel start. It should not
+// be affected by CPU frequency. If not available use the REALTIME clock, but
+// this may be affected by NTP adjustments or other user activity affecting
+// the system time.
+U64 LLFastTimer::getCPUClockCount64() // returns tv_sec*sClockResolution + tv_nsec; out-of-line so other translation units can link to it
+{
+	struct timespec tp;
+	// The 'if' below is deliberately brace-less: without CLOCK_MONOTONIC at build time the REALTIME call becomes unconditional.
+#ifdef CLOCK_MONOTONIC // MONOTONIC supported at build-time?
+	if (-1 == clock_gettime(CLOCK_MONOTONIC,&tp)) // if MONOTONIC isn't supported at runtime then ouch, try REALTIME
+#endif
+		clock_gettime(CLOCK_REALTIME,&tp);
+
+	return (tp.tv_sec*LLFastTimer::sClockResolution)+tp.tv_nsec;
+}
+
+U32 LLFastTimer::getCPUClockCount32() // 64-bit count >> 8, truncated to 32 bits; out-of-line so other translation units can link to it
+{
+	return (U32)(LLFastTimer::getCPUClockCount64() >> 8);
+}
+#endif // (LL_LINUX || LL_SOLARIS) && !(defined(__i386__) || defined(__amd64__))
+
+
+#if (LL_LINUX || LL_SOLARIS || LL_DARWIN) && (defined(__i386__) || defined(__amd64__))
+//
+// Mac+Linux+Solaris FAST x86 implementation of CPU clock
+U32 LLFastTimer::getCPUClockCount32() // bits 8..39 of the RDTSC time stamp; out-of-line so other translation units can link to it
+{
+	U32 lo, hi; // read eax and edx separately: "=A" means the edx:eax pair only on i386, on amd64 it picks a single register
+	__asm__ volatile (".byte 0x0f, 0x31": "=a"(lo), "=d"(hi));
+	return (U32)(((((U64)hi) << 32) | lo) >> 8);
+}
+
+U64 LLFastTimer::getCPUClockCount64() // full 64-bit RDTSC time stamp, *not* shifted; out-of-line so other translation units can link to it
+{
+	U32 lo, hi; // read eax and edx separately: "=A" means the edx:eax pair only on i386, on amd64 it picks a single register
+	__asm__ volatile (".byte 0x0f, 0x31": "=a"(lo), "=d"(hi));
+	return (((U64)hi) << 32) | lo;
+}
+#endif
+
+
+#if ( LL_DARWIN && !(defined(__i386__) || defined(__amd64__)))
+//
+// Mac PPC (deprecated) implementation of CPU clock
+//
+// Just use gettimeofday implementation for now
+
+U32 LLFastTimer::getCPUClockCount32() // get_clock_count() >> 8, truncated to 32 bits; out-of-line so other translation units can link to it
+{
+	return (U32)(get_clock_count()>>8);
+}
+
+U64 LLFastTimer::getCPUClockCount64() // full get_clock_count() value, *not* shifted by 8 bits; out-of-line so other translation units can link to it
+{
+	return get_clock_count();
+}
+#endif
+