diff --git a/.hgtags b/.hgtags
index 25b4fd7cc675b544a90f6de4aac027e8af282f4a..a3058aa2539827fcc16f7066c88e2c5758102976 100755
--- a/.hgtags
+++ b/.hgtags
@@ -492,3 +492,4 @@ a7872554f3665588f1e8347d472cec3a299254b3 3.7.14-release
 bcc2770e21c125e0bab59141c51db9145aec068d 3.7.17-release
 2729c1daf0257d68a40bdbc4acf1a16184974bbd 3.7.18-release
 82973b38a6c9a457333e3519e4f2b16bb5eedf47 3.7.19-release
+27094824773b907c2e559396e6f9ec3a963de52d 3.7.20-release
diff --git a/indra/llrender/llglslshader.cpp b/indra/llrender/llglslshader.cpp
index 9fae63385df9d3c9bc28829627d6156b46066f6a..b81dd4c9a129cc1152d1bbadb171d51b7b49a3e5 100755
--- a/indra/llrender/llglslshader.cpp
+++ b/indra/llrender/llglslshader.cpp
@@ -87,6 +87,7 @@ LLShaderFeatures::LLShaderFeatures()
 	, mIndexedTextureChannels(0)
 	, disableTextureIndex(false)
 	, hasAlphaMask(false)
+	, attachNothing(false)
 {
 }
 
@@ -119,28 +120,31 @@ struct LLGLSLShaderCompareTimeElapsed
 };
 
 //static
-void LLGLSLShader::finishProfile()
+void LLGLSLShader::finishProfile(bool emit_report)
 {
 	sProfileEnabled = false;
 
-	std::vector<LLGLSLShader*> sorted;
-
-	for (std::set<LLGLSLShader*>::iterator iter = sInstances.begin(); iter != sInstances.end(); ++iter)
+	if (emit_report)
 	{
-		sorted.push_back(*iter);
-	}
+		std::vector<LLGLSLShader*> sorted;
 
-	std::sort(sorted.begin(), sorted.end(), LLGLSLShaderCompareTimeElapsed());
+		for (std::set<LLGLSLShader*>::iterator iter = sInstances.begin(); iter != sInstances.end(); ++iter)
+		{
+			sorted.push_back(*iter);
+		}
 
-	for (std::vector<LLGLSLShader*>::iterator iter = sorted.begin(); iter != sorted.end(); ++iter)
-	{
-		(*iter)->dumpStats();
-	}
+		std::sort(sorted.begin(), sorted.end(), LLGLSLShaderCompareTimeElapsed());
 
+		for (std::vector<LLGLSLShader*>::iterator iter = sorted.begin(); iter != sorted.end(); ++iter)
+		{
+			(*iter)->dumpStats();
+		}
+			
 	LL_INFOS() << "-----------------------------------" << LL_ENDL;
 	LL_INFOS() << "Total rendering time: " << llformat("%.4f ms", sTotalTimeElapsed/1000000.f) << LL_ENDL;
 	LL_INFOS() << "Total samples drawn: " << llformat("%.4f million", sTotalSamplesDrawn/1000000.f) << LL_ENDL;
 	LL_INFOS() << "Total triangles drawn: " << llformat("%.3f million", sTotalTrianglesDrawn/1000000.f) << LL_ENDL;
+	}
 }
 
 void LLGLSLShader::clearStats()
@@ -175,7 +179,7 @@ void LLGLSLShader::dumpStats()
 			}
 		}
 		LL_INFOS() << "=============================================" << LL_ENDL;
-
+	
 		F32 ms = mTimeElapsed/1000000.f;
 		F32 seconds = ms/1000.f;
 
@@ -221,6 +225,7 @@ void LLGLSLShader::placeProfileQuery()
 #if !LL_DARWIN
 	if (mTimerQuery == 0)
 	{
+		glGenQueriesARB(1, &mSamplesQuery);
 		glGenQueriesARB(1, &mTimerQuery);
 	}
 
@@ -257,7 +262,7 @@ void LLGLSLShader::placeProfileQuery()
 	}
 
 
-	glBeginQueryARB(GL_SAMPLES_PASSED, 1);
+	glBeginQueryARB(GL_SAMPLES_PASSED, mSamplesQuery);
 	glBeginQueryARB(GL_TIME_ELAPSED, mTimerQuery);
 #endif
 }
@@ -272,7 +277,7 @@ void LLGLSLShader::readProfileQuery(U32 count, U32 mode)
 	glGetQueryObjectui64v(mTimerQuery, GL_QUERY_RESULT, &time_elapsed);
 
 	U64 samples_passed = 0;
-	glGetQueryObjectui64v(1, GL_QUERY_RESULT, &samples_passed);
+	glGetQueryObjectui64v(mSamplesQuery, GL_QUERY_RESULT, &samples_passed);
 
 	sTotalTimeElapsed += time_elapsed;
 	mTimeElapsed += time_elapsed;
@@ -307,14 +312,15 @@ LLGLSLShader::LLGLSLShader()
 	  mShaderLevel(0), 
 	  mShaderGroup(SG_DEFAULT), 
 	  mUniformsDirty(FALSE),
-	  mTimerQuery(0)
+	  mTimerQuery(0),
+	  mSamplesQuery(0)
+
 {
 	
 }
 
 LLGLSLShader::~LLGLSLShader()
 {
-	
 }
 
 void LLGLSLShader::unload()
@@ -349,6 +355,18 @@ void LLGLSLShader::unload()
 		mProgramObject = 0;
 	}
 	
+	if (mTimerQuery)
+	{
+		glDeleteQueriesARB(1, &mTimerQuery);
+		mTimerQuery = 0;
+	}
+	
+	if (mSamplesQuery)
+	{
+		glDeleteQueriesARB(1, &mSamplesQuery);
+		mSamplesQuery = 0;
+	}
+
 	//hack to make apple not complain
 	glGetError();
 	
diff --git a/indra/llrender/llglslshader.h b/indra/llrender/llglslshader.h
index 7b2f5f04c2f3390108d7023ca4674d7393d83289..5abddf274b4a689779d07bfc24236e2b32fb0999 100755
--- a/indra/llrender/llglslshader.h
+++ b/indra/llrender/llglslshader.h
@@ -51,6 +51,7 @@ class LLShaderFeatures
 	S32 mIndexedTextureChannels;
 	bool disableTextureIndex;
 	bool hasAlphaMask;
+	bool attachNothing;
 
 	// char numLights;
 	
@@ -80,7 +81,7 @@ class LLGLSLShader
 	static bool sNoFixedFunction;
 
 	static void initProfile();
-	static void finishProfile();
+	static void finishProfile(bool emit_report = true);
 
 	static void startProfile();
 	static void stopProfile(U32 count, U32 mode);
@@ -184,6 +185,7 @@ class LLGLSLShader
 
 	//statistcis for profiling shader performance
 	U32 mTimerQuery;
+	U32 mSamplesQuery;
 	U64 mTimeElapsed;
 	static U64 sTotalTimeElapsed;
 	U32 mTrianglesDrawn;
diff --git a/indra/llrender/llrendertarget.cpp b/indra/llrender/llrendertarget.cpp
index 955ea450c1dceb7ff6772e30ee898df69155a002..cd484b4fe901cdc7053a7bacddf1240e26fdf12e 100755
--- a/indra/llrender/llrendertarget.cpp
+++ b/indra/llrender/llrendertarget.cpp
@@ -388,6 +388,7 @@ void LLRenderTarget::release()
 	//
 	if (mFBO && (mTex.size() > 1))
 	{		
+		glBindFramebuffer(GL_FRAMEBUFFER, mFBO);
 		S32 z;
 		for (z = mTex.size() - 1; z >= 1; z--)
 		{
diff --git a/indra/llrender/llshadermgr.cpp b/indra/llrender/llshadermgr.cpp
index 95a2c8b589084a3c5cbdb993621b0ce5a8b6ddbc..a89ec675b4b1a5cf9f9e2fa24474b485fc09d7a9 100755
--- a/indra/llrender/llshadermgr.cpp
+++ b/indra/llrender/llshadermgr.cpp
@@ -73,7 +73,11 @@ BOOL LLShaderMgr::attachShaderFeatures(LLGLSLShader * shader)
 {
 	llassert_always(shader != NULL);
 	LLShaderFeatures *features = & shader->mFeatures;
-	
+
+	if (features->attachNothing)
+	{
+		return TRUE;
+	}
 	//////////////////////////////////////
 	// Attach Vertex Shader Features First
 	//////////////////////////////////////
diff --git a/indra/newview/VIEWER_VERSION.txt b/indra/newview/VIEWER_VERSION.txt
index 82a60c0bb1b133c128bf17d7cbd9ac054714a349..c6cff55cf75f2ee2685968efe2546e65c643cf6f 100644
--- a/indra/newview/VIEWER_VERSION.txt
+++ b/indra/newview/VIEWER_VERSION.txt
@@ -1 +1 @@
-3.7.20
+3.7.21
diff --git a/indra/newview/app_settings/settings.xml b/indra/newview/app_settings/settings.xml
index 041e802626f0c07505b31c303d0f1b24152704b4..94d3c8a59fca5661fef53a4f3ffd6b3cb20a6526 100755
--- a/indra/newview/app_settings/settings.xml
+++ b/indra/newview/app_settings/settings.xml
@@ -9500,7 +9500,7 @@
       <key>Type</key>
       <string>Boolean</string>
       <key>Value</key>
-      <integer>0</integer>
+      <integer>1</integer>
     </map>
     <key>NameTagShowDisplayNames</key>
     <map>
diff --git a/indra/newview/llfeaturemanager.cpp b/indra/newview/llfeaturemanager.cpp
index d0555477eacbdf3aa51220c3ea0432e386d85c95..4db04226341bb136a3775663c0dc6199b19d0e1a 100755
--- a/indra/newview/llfeaturemanager.cpp
+++ b/indra/newview/llfeaturemanager.cpp
@@ -417,13 +417,71 @@ bool LLFeatureManager::parseFeatureTable(std::string filename)
 	return parse_ok;
 }
 
+F32 gpu_benchmark();
+
 bool LLFeatureManager::loadGPUClass()
 {
+	//estimate memory bandwidth in GB/sec via gpu_benchmark(); a negative value means the benchmark could not run
+	F32 gbps = gpu_benchmark();
+
+	if (gbps < 0.f)
+	{ //couldn't bench, use GLVersion
+#if LL_DARWIN
+        //GLVersion is misleading on OSX, just default to class 3 if we can't bench
+        mGPUClass = GPU_CLASS_3;
+#else
+		if (gGLManager.mGLVersion < 2.f)
+		{
+			mGPUClass = GPU_CLASS_0;
+		}
+		else if (gGLManager.mGLVersion < 3.f)
+		{
+			mGPUClass = GPU_CLASS_1;
+		}
+		else if (gGLManager.mGLVersion < 3.3f)
+		{
+			mGPUClass = GPU_CLASS_2;
+		}
+		else if (gGLManager.mGLVersion < 4.f)
+		{
+			mGPUClass = GPU_CLASS_3;
+		}
+		else 
+		{
+			mGPUClass = GPU_CLASS_4;
+		}
+#endif
+	}
+	else if (gbps < 5.f)
+	{
+		mGPUClass = GPU_CLASS_0;
+	}
+	else if (gbps < 10.f)
+	{
+		mGPUClass = GPU_CLASS_1;
+	}
+	else if (gbps < 20.f)
+	{
+		mGPUClass = GPU_CLASS_2;
+	}
+	else if (gbps < 40.f)
+	{
+		mGPUClass = GPU_CLASS_3;
+	}
+	else if (gbps < 80.f)
+	{
+		mGPUClass = GPU_CLASS_4;
+	}
+	else 
+	{
+		mGPUClass = GPU_CLASS_5;
+	}
+	
 	// defaults
-	mGPUClass = GPU_CLASS_UNKNOWN;
 	mGPUString = gGLManager.getRawGLString();
-	mGPUSupported = FALSE;
+	mGPUSupported = TRUE;
 
+#if 0
 	// first table is in the app dir
 	std::string app_path = gDirUtilp->getAppRODataDir();
 	app_path += gDirUtilp->getDirDelimiter();
@@ -451,8 +509,8 @@ bool LLFeatureManager::loadGPUClass()
 	{
 		parse_ok = parseGPUTable(app_path);
 	}
-
-	return parse_ok; // indicates that the file parsed correctly, not that the gpu was recognized
+#endif
+	return true; // GPU table parsing is compiled out above (#if 0), so this always reports success
 }
 
 	
@@ -730,6 +788,7 @@ void LLFeatureManager::init()
 
 void LLFeatureManager::applyRecommendedSettings()
 {
+	loadGPUClass();
 	// apply saved settings
 	// cap the level at 2 (high)
 	U32 level = llmax(GPU_CLASS_0, llmin(mGPUClass, GPU_CLASS_5));
diff --git a/indra/newview/llglsandbox.cpp b/indra/newview/llglsandbox.cpp
index c386030329db1f06470b74730f62868ec88db7d6..4b8ac2b3cf32407ac4d13f87525c67b66e42799c 100755
--- a/indra/newview/llglsandbox.cpp
+++ b/indra/newview/llglsandbox.cpp
@@ -879,13 +879,32 @@ void LLViewerObjectList::renderObjectBeacons()
 }
 
 
-void gpu_benchmark()
+F32 gpu_benchmark()
 {
-	if (!LLGLSLShader::sNoFixedFunction)
+	if (!gGLManager.mHasShaderObjects)
 	{ //don't bother benchmarking the fixed function
-		return;
+		return -1.f;
 	}
 
+	
+	if (gBenchmarkProgram.mProgramObject == 0)
+	{
+		LLViewerShaderMgr::instance()->initAttribsAndUniforms();
+
+		gBenchmarkProgram.mName = "Benchmark Shader";
+		gBenchmarkProgram.mFeatures.attachNothing = true;
+		gBenchmarkProgram.mShaderFiles.clear();
+		gBenchmarkProgram.mShaderFiles.push_back(std::make_pair("interface/benchmarkV.glsl", GL_VERTEX_SHADER_ARB));
+		gBenchmarkProgram.mShaderFiles.push_back(std::make_pair("interface/benchmarkF.glsl", GL_FRAGMENT_SHADER_ARB));
+		gBenchmarkProgram.mShaderLevel = 1;
+		if (!gBenchmarkProgram.createShader(NULL, NULL))
+		{
+			return -1.f;
+		}
+	}
+
+	LLGLDisable blend(GL_BLEND);
+	
 	//measure memory bandwidth by:
 	// - allocating a batch of textures and render targets
 	// - rendering those textures to those render targets
@@ -909,7 +928,7 @@ void gpu_benchmark()
 	std::vector<F32> results;
 
 	//build a random texture
-	U8 pixels[res*res*4];
+	U8* pixels = new U8[res*res*4];
 
 	for (U32 i = 0; i < res*res*4; ++i)
 	{
@@ -931,6 +950,8 @@ void gpu_benchmark()
 		LLImageGL::setManualImage(GL_TEXTURE_2D, 0, GL_RGBA, res,res,GL_RGBA, GL_UNSIGNED_BYTE, pixels);
 	}
 
+    delete [] pixels;
+
 	//make a dummy triangle to draw with
 	LLPointer<LLVertexBuffer> buff = new LLVertexBuffer(LLVertexBuffer::MAP_VERTEX | LLVertexBuffer::MAP_TEXCOORD0, GL_STATIC_DRAW_ARB);
 	buff->allocateBuffer(3, 0, true);
@@ -951,6 +972,8 @@ void gpu_benchmark()
 	//wait for any previoius GL commands to finish
 	glFinish();
 	
+	bool busted_finish = false;
+
 	for (S32 c = -1; c < samples; ++c)
 	{
 		LLTimer timer;
@@ -965,7 +988,18 @@ void gpu_benchmark()
 		}
 		
 		//wait for current batch of copies to finish
-		glFinish();
+		if (busted_finish)
+		{
+			//read a pixel off the last target since some drivers seem to ignore glFinish
+			dest[count-1].bindTarget();
+			U32 pixel = 0;
+			glReadPixels(0,0,1,1,GL_RGBA, GL_UNSIGNED_BYTE, &pixel);
+			dest[count-1].flush();
+		}
+		else
+		{
+			glFinish();
+		}
 
 		F32 time = timer.getElapsedTimeF32();
 
@@ -976,13 +1010,20 @@ void gpu_benchmark()
 
 			F32 gbps = gb/time;
 
-			results.push_back(gbps);
+			if (!gGLManager.mHasTimerQuery && !busted_finish && gbps > 128.f)
+			{ //unrealistically high bandwidth for a card without timer queries, glFinish is probably ignored; drop this sample and sync later iterations via glReadPixels
+				busted_finish = true;
+			}
+			else
+			{
+				results.push_back(gbps);
+			}		
 		}
 	}
 
 	gBenchmarkProgram.unbind();
 
-	LLGLSLShader::finishProfile();
+	LLGLSLShader::finishProfile(false);
 	
 	LLImageGL::deleteTextures(count, source);
 
@@ -992,21 +1033,32 @@ void gpu_benchmark()
 	F32 gbps = results[results.size()/2];
 
 	LL_INFOS() << "Memory bandwidth is " << llformat("%.3f", gbps) << "GB/sec according to CPU timers" << LL_ENDL;
-	
-	F32 ms = gBenchmarkProgram.mTimeElapsed/1000000.f;
-	F32 seconds = ms/1000.f;
-
-	F64 samples_drawn = res*res*count*samples;
-	F32 samples_sec = (samples_drawn/1000000000.0)/seconds;
-	gbps = samples_sec*8;
+  
+#if LL_DARWIN
+    if (gbps > 512.f)
+    { 
+        LL_INFOS() << "Memory bandwidth is improbably high and likely incorrect." << LL_ENDL;
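+        //OSX is probably lying, discard result -- NOTE(review): gbps is overwritten below when gGLManager.mHasTimerQuery is true; confirm that is intended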
+        gbps = -1.f;
+    }
+#endif
 
 	if (gGLManager.mHasTimerQuery)
 	{
+		F32 ms = gBenchmarkProgram.mTimeElapsed/1000000.f;
+		F32 seconds = ms/1000.f;
+
+		F64 samples_drawn = res*res*count*samples;
+		F32 samples_sec = (samples_drawn/1000000000.0)/seconds;
+		gbps = samples_sec*8;
+
 		LL_INFOS() << "Memory bandwidth is " << llformat("%.3f", gbps) << "GB/sec according to ARB_timer_query" << LL_ENDL;
 	}
 	else
 	{
 		LL_INFOS() << "ARB_timer_query unavailable." << LL_ENDL;
 	}
+
+	return gbps;
 }
 
diff --git a/indra/newview/llviewermenu.cpp b/indra/newview/llviewermenu.cpp
index faf929d8f92ea2c8018713e3e25b4c548f45aa45..3abeba4b43301a0846f89ec988e0a0d6c67a2713 100755
--- a/indra/newview/llviewermenu.cpp
+++ b/indra/newview/llviewermenu.cpp
@@ -7216,7 +7216,7 @@ class LLAdvancedClickRenderProfile: public view_listener_t
 	}
 };
 
-void gpu_benchmark();
+F32 gpu_benchmark();
 
 class LLAdvancedClickRenderBenchmark: public view_listener_t
 {