/* Copyright (C) 2009-2015 Free Software Foundation, Inc.
   Contributed by Richard Henderson <rth@redhat.com>.

   This file is part of the GNU Transactional Memory Library (libitm).

   Libitm is free software; you can redistribute it and/or modify it
   under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   Libitm is distributed in the hope that it will be useful, but WITHOUT ANY
   WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
   FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
   details.

   Under Section 7 of GPL version 3, you are granted additional
   permissions described in the GCC Runtime Library Exception, version
   3.1, as published by the Free Software Foundation.

   You should have received a copy of the GNU General Public License and
   a copy of the GCC Runtime Library Exception along with this program;
   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
   <http://www.gnu.org/licenses/>.  */

#ifndef LIBITM_CACHELINE_H
#define LIBITM_CACHELINE_H 1

// Minimum cacheline size is 32, due to both complex long double and __m256.
// There's no requirement that 64-bit use a 64-byte cacheline size, but do
// so for now to make sure everything is parameterized properly.
#ifdef __x86_64__
# define CACHELINE_SIZE 64
#else
# define CACHELINE_SIZE 32
#endif

namespace GTM HIDDEN {

// A gtm_cacheline_mask stores a modified bit for every modified byte
// in the cacheline with which it is associated.
typedef sized_integral<CACHELINE_SIZE / 8>::type gtm_cacheline_mask;

union gtm_cacheline
{
  // Byte access to the cacheline.
  unsigned char b[CACHELINE_SIZE] __attribute__((aligned(CACHELINE_SIZE)));

  // Larger sized access to the cacheline.
  uint16_t u16[CACHELINE_SIZE / sizeof(uint16_t)];
  uint32_t u32[CACHELINE_SIZE / sizeof(uint32_t)];
  uint64_t u64[CACHELINE_SIZE / sizeof(uint64_t)];
  gtm_word w[CACHELINE_SIZE / sizeof(gtm_word)];

#ifdef __MMX__
  __m64 m64[CACHELINE_SIZE / sizeof(__m64)];
#endif
#ifdef __SSE__
  __m128 m128[CACHELINE_SIZE / sizeof(__m128)];
#endif
#ifdef __SSE2__
  __m128i m128i[CACHELINE_SIZE / sizeof(__m128i)];
#endif
#ifdef __AVX__
  __m256 m256[CACHELINE_SIZE / sizeof(__m256)];
  __m256i m256i[CACHELINE_SIZE / sizeof(__m256i)];
#endif

#if defined(__SSE__) || defined(__AVX__)
  // Copy S to D; only bother defining if we can do this more efficiently
  // than the compiler-generated default implementation.
  gtm_cacheline& operator= (const gtm_cacheline &s);
#endif // SSE, AVX
};

#if defined(__SSE__) || defined(__AVX__)
inline gtm_cacheline& ALWAYS_INLINE
gtm_cacheline::operator= (const gtm_cacheline & __restrict s)
{
#ifdef __AVX__
# define CP m256
# define TYPE __m256
#else
# define CP m128
# define TYPE __m128
#endif

  TYPE w, x, y, z;

  // ??? Wouldn't it be nice to have a pragma to tell the compiler
  // to completely unroll a given loop?
  switch (CACHELINE_SIZE / sizeof(TYPE))
    {
    case 1:
      this->CP[0] = s.CP[0];
      break;
    case 2:
      x = s.CP[0];
      y = s.CP[1];
      this->CP[0] = x;
      this->CP[1] = y;
      break;
    case 4:
      w = s.CP[0];
      x = s.CP[1];
      y = s.CP[2];
      z = s.CP[3];
      this->CP[0] = w;
      this->CP[1] = x;
      this->CP[2] = y;
      this->CP[3] = z;
      break;
    default:
      __builtin_trap ();
    }

  return *this;

#undef CP
#undef TYPE
}
#endif

} // namespace GTM

#endif // LIBITM_CACHELINE_H
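
// Note on the "???" comment inside operator= above: GCC 8 and later do
// provide such a pragma (#pragma GCC unroll N).  A sketch of how the
// switch could then be written, illustrative only since the code above
// predates the pragma:
//
//   #pragma GCC unroll 4
//   for (size_t i = 0; i < CACHELINE_SIZE / sizeof (TYPE); i++)
//     this->CP[i] = s.CP[i];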
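
// Illustrative sketch, not part of libitm: how a byte-granular dirty mask
// of the kind gtm_cacheline_mask provides might be computed.  The guard
// macro LIBITM_CACHELINE_MASK_EXAMPLE is hypothetical, used here only so
// the example never affects a real build.
#ifdef LIBITM_CACHELINE_MASK_EXAMPLE
#include <cstdint>
#include <cstddef>

// On x86-64 (CACHELINE_SIZE == 64) the mask is a 64-bit integer: bit I
// set means byte I of the associated cacheline has been modified.
typedef uint64_t example_mask;

// Mask covering a store of SIZE bytes at byte offset OFS within the line.
static inline example_mask
example_mask_for_store (size_t ofs, size_t size)
{
  // One mask bit per stored byte; avoid the undefined full-width shift
  // when the store covers the entire line.
  example_mask bits = (size >= sizeof (example_mask) * 8
		       ? ~(example_mask) 0
		       : ((example_mask) 1 << size) - 1);
  return bits << ofs;
}

// E.g. a uint32_t store at offset 8 yields 0xf00: bits 8..11 set.
#endif // LIBITM_CACHELINE_MASK_EXAMPLE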