From 6791b060e74b634e36f37ed5d7547a393cd17c32 Mon Sep 17 00:00:00 2001
From: Rye Mutt <rye@alchemyviewer.org>
Date: Wed, 21 Jul 2021 12:53:30 -0400
Subject: [PATCH] Add non-allocating to_chars functionality to LLUUID with SSE
 4.1 and scalar variants

---
 indra/llcommon/lluuid.cpp | 110 ++++++++++++++++++++++++--------------
 indra/llcommon/lluuid.h   |  39 ++++++++++----
 2 files changed, 98 insertions(+), 51 deletions(-)

diff --git a/indra/llcommon/lluuid.cpp b/indra/llcommon/lluuid.cpp
index 499d3bd1692..5bf6533622a 100644
--- a/indra/llcommon/lluuid.cpp
+++ b/indra/llcommon/lluuid.cpp
@@ -152,33 +152,75 @@ U32 janky_fast_random_seeded_bytes(U32 seed, U32 val)
 #endif
 
 // Common to all UUID implementations
-void LLUUID::toString(std::string& out) const
+void LLUUID::to_chars(char* out) const
 {
-	out = fmt::format(FMT_COMPILE("{:02x}{:02x}{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}-{:02x}{:02x}{:02x}{:02x}{:02x}{:02x}"),
-		(U8)(mData[0]),
-		(U8)(mData[1]),
-		(U8)(mData[2]),
-		(U8)(mData[3]),
-		(U8)(mData[4]),
-		(U8)(mData[5]),
-		(U8)(mData[6]),
-		(U8)(mData[7]),
-		(U8)(mData[8]),
-		(U8)(mData[9]),
-		(U8)(mData[10]),
-		(U8)(mData[11]),
-		(U8)(mData[12]),
-		(U8)(mData[13]),
-		(U8)(mData[14]),
-		(U8)(mData[15]));
-}
-
-// *TODO: deprecate
-void LLUUID::toString(char *out) const
-{
-	std::string buffer;
-	toString(buffer);
-	strcpy(out,buffer.c_str()); /* Flawfinder: ignore */
+#if defined(__SSE4_1__)
+    alignas(16) char buffer[UUID_STR_SIZE-1]; // Temporary aligned output buffer for simd op
+    
+    __m128i lower = load_unaligned_si128(mData);
+    __m128i upper = _mm_and_si128(_mm_set1_epi8(0xFF >> 4), _mm_srli_epi32(lower, 4));
+    
+    const __m128i a = _mm_set1_epi8(0x0F);
+    lower = _mm_and_si128(lower, a);
+    upper = _mm_and_si128(upper, a);
+    
+    const __m128i pastNine = _mm_set1_epi8(9 + 1);
+    const __m128i lowerMask = _mm_cmplt_epi8(lower, pastNine);
+    const __m128i upperMask = _mm_cmplt_epi8(upper, pastNine);
+    
+    __m128i letterMask1 = _mm_and_si128(lower, lowerMask);
+    __m128i letterMask2 = _mm_and_si128(upper, upperMask);
+    __m128i letterMask3 = _mm_or_si128(lower, lowerMask);
+    __m128i letterMask4 = _mm_or_si128(upper, upperMask);
+    
+    const __m128i first = _mm_set1_epi8('0');
+    const __m128i second = _mm_set1_epi8('a' - 10);
+    
+    letterMask1 = _mm_add_epi8(letterMask1, first);
+    letterMask2 = _mm_add_epi8(letterMask2, first);
+    letterMask3 = _mm_add_epi8(letterMask3, second);
+    letterMask4 = _mm_add_epi8(letterMask4, second);
+    
+    lower = _mm_blendv_epi8(letterMask3, letterMask1, lowerMask);
+    upper = _mm_blendv_epi8(letterMask4, letterMask2, upperMask);
+    
+    const __m128i mask1 = _mm_shuffle_epi8(lower, _mm_setr_epi8(-1, 0, -1, 1, -1, 2, -1, 3, -1, -1, 4, -1, 5, -1, -1, 6));
+    const __m128i mask2 = _mm_shuffle_epi8(upper, _mm_setr_epi8(0, -1, 1, -1, 2, -1, 3, -1, -1, 4, -1, 5, -1, -1, 6, -1));
+    const __m128i mask3 = _mm_shuffle_epi8(lower, _mm_setr_epi8(-1, 7, -1, -1, 8, -1, 9, -1, -1, 10, -1, 11, -1, 12, -1, 13));
+    const __m128i mask4 = _mm_shuffle_epi8(upper, _mm_setr_epi8(7, -1, -1, 8, -1, 9, -1, -1, 10, -1, 11, -1, 12, -1, 13, -1));
+    const __m128i hypens = _mm_set_epi8(0, 0, '-', 0, 0, 0, 0, '-', 0, 0, 0, 0, 0, 0, 0, 0);
+    const __m128i hypens2 = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, '-', 0, 0, 0, 0, '-', 0, 0);
+    const __m128i upperSorted = _mm_or_si128(_mm_or_si128(mask1, mask2), hypens);
+    const __m128i lowerSorted = _mm_or_si128(_mm_or_si128(mask3, mask4), hypens2);
+    
+    _mm_store_si128(reinterpret_cast<__m128i *>(buffer), upperSorted);
+    _mm_store_si128(reinterpret_cast<__m128i *>(buffer + UUID_BYTES), lowerSorted);
+    
+    // Did not fit the last four chars. Extract and append them.
+    const int v1 = _mm_extract_epi16(upper, 7);
+    const int v2 = _mm_extract_epi16(lower, 7);
+    buffer[32] = (v1 & 0xff);
+    buffer[33] = (v2 & 0xff);
+    buffer[34] = ((v1 >> 8) & 0xff);
+    buffer[35] = ((v2 >> 8) & 0xff);
+    
+    memcpy(out, buffer, UUID_STR_SIZE-1);
+#else
+    for (size_t i = 0; i < UUID_BYTES; ++i)
+    {
+        const auto uuid_byte = mData[i];
+        const size_t hi = ((uuid_byte) >> 4) & 0x0F;
+        *out++ = (i <= 9) ? static_cast<char>('0' + hi) : static_cast<char>('a' + (hi-10));;
+        
+        const size_t lo = (uuid_byte) & 0x0F;
+        *out++ = (i <= 9) ? static_cast<char>('0' + lo) : static_cast<char>('a' + (lo-10));;
+        
+        if (i == 3 || i == 5 || i == 7 || i == 9)
+        {
+            *out++ = '-';
+        }
+    }
+#endif
 }
 
 void LLUUID::toCompressedString(std::string& out) const
@@ -196,18 +238,6 @@ void LLUUID::toCompressedString(char *out) const
 	out[UUID_BYTES] = '\0';
 }
 
-std::string LLUUID::getString() const
-{
-	return asString();
-}
-
-std::string LLUUID::asString() const
-{
-	std::string str;
-	toString(str);
-	return str;
-}
-
 BOOL LLUUID::set(const char* in_string, BOOL emit)
 {
 	return set(absl::NullSafeStringView(in_string),emit);
@@ -416,8 +446,8 @@ LLUUID LLUUID::combine(const LLUUID &other) const
 
 std::ostream& operator<<(std::ostream& s, const LLUUID &uuid)
 {
-	std::string uuid_str;
-	uuid.toString(uuid_str);
+    char uuid_str[37] = {}; // will be null-terminated
+    uuid.to_chars(uuid_str);
 	s << uuid_str;
 	return s;
 }
diff --git a/indra/llcommon/lluuid.h b/indra/llcommon/lluuid.h
index 75148a6a0de..6ac32b67a1f 100644
--- a/indra/llcommon/lluuid.h
+++ b/indra/llcommon/lluuid.h
@@ -40,11 +40,11 @@
 
 class LLMutex;
 
-const S32 UUID_BYTES = 16;
-const S32 UUID_WORDS = 4;
-const S32 UUID_STR_LENGTH = 37;	// actually wrong, should be 36 and use size below
-const S32 UUID_STR_SIZE = 37;
-const S32 UUID_BASE85_LENGTH = 21; // including the trailing NULL.
+static constexpr S32 UUID_BYTES = 16;
+static constexpr S32 UUID_WORDS = 4;
+static constexpr S32 UUID_STR_LENGTH = 37;	// actually wrong, should be 36 and use size below
+static constexpr S32 UUID_STR_SIZE = 37;
+static constexpr S32 UUID_BASE85_LENGTH = 21; // including the trailing NULL.
 
 struct uuid_time_t {
 	U32 high;
@@ -195,8 +195,8 @@ class LL_COMMON_API LLUUID
 			const absl::FormatConversionSpec& spec,
 			absl::FormatSink* s) {
 		if (spec.conversion_char() == absl::FormatConversionChar::s) {
-			std::string uuid_str;
-			id.toString(uuid_str);
+            char uuid_str[UUID_STR_SIZE] = {}; // will be null-terminated
+            id.to_chars(uuid_str);
 			s->Append(uuid_str);
 		}
 		return { true };
@@ -218,13 +218,30 @@ class LL_COMMON_API LLUUID
 	friend LL_COMMON_API std::ostream&	 operator<<(std::ostream& s, const LLUUID &uuid);
 	friend LL_COMMON_API std::istream&	 operator>>(std::istream& s, LLUUID &uuid);
 
-	void toString(char *out) const;		// Does not allocate memory, needs 36 characters (including \0)
-	void toString(std::string& out) const;
+    void to_chars(char* outstr) const; // Does not allocate memory, needs 36 characters (does not null terminate)
+	void toString(char* outstr) const		// Does not allocate memory, needs 37 characters (including \0)
+    {
+        to_chars(outstr);
+        outstr[UUID_STR_SIZE-1] = '\0';
+    }
+
+	void toString(std::string& outstr) const
+    {
+        outstr.resize(UUID_STR_SIZE-1);
+        to_chars(&outstr[0]);
+    }
+    
 	void toCompressedString(char *out) const;	// Does not allocate memory, needs 17 characters (including \0)
 	void toCompressedString(std::string& out) const;
 
-	std::string asString() const;
-	std::string getString() const;
+	std::string asString() const
+    {
+        std::string result(36, char());
+        to_chars(&result[0]);
+        return result;
+    }
+
+	std::string getString() const { return asString(); }
 
 	U16 getCRC16() const;
 	U32 getCRC32() const;
-- 
GitLab