Concept Validation Queries

The following SQL queries (based on rules defined within this Talk discussion) may be used to check for concept validation rule violations within an OpenMRS database. Any results (other than an empty set) from any of the queries represents validation issues.

-- ==========================================================
-- List concepts with multiple preferred names in same locale
-- ==========================================================
SELECT cn1.concept_id, locale, name, locale_preferred
FROM concept_name cn1
WHERE locale_preferred
AND NOT voided
AND EXISTS
	(SELECT COUNT(*) as n, cn2.concept_id, locale, name, locale_preferred
  FROM concept_name cn2
  WHERE cn2.concept_id = cn1.concept_id
  AND cn2.locale = cn1.locale
  AND locale_preferred
  AND NOT voided
  GROUP BY cn2.concept_id, locale
  HAVING n > 1)
ORDER BY cn1.concept_id, locale, name;

-- ========================================================================
-- List concepts with duplicate name in same locale (excluding short names)
-- ========================================================================
SELECT cn.concept_id, cn.name, cn.locale
FROM concept_name cn
INNER JOIN (
    SELECT concept_id, name, locale
    FROM concept_name
    WHERE concept_name_type <> 'SHORT' AND NOT voided
    GROUP BY concept_id, name, locale
    HAVING COUNT(*) > 1
) AS subquery
ON cn.concept_id = subquery.concept_id
  AND cn.name = subquery.name
  AND cn.locale = subquery.locale
WHERE cn.concept_name_type <> 'SHORT' AND NOT cn.voided;

-- ==============================================================
-- List concepts with more than one short name in the same locale
-- ==============================================================
SELECT concept_id, locale, name, concept_name_type
FROM concept_name cn1
WHERE concept_name_type = 'SHORT'
AND EXISTS (
  SELECT COUNT(*) as n, concept_id, locale
  FROM concept_name cn2
  WHERE cn2.concept_name_type = cn1.concept_name_type
  AND cn2.concept_id = cn1.concept_id
  AND cn2.locale = cn1.locale
  AND NOT cn2.voided
  GROUP BY cn2.concept_id, cn2.locale
  HAVING n > 1)
ORDER BY concept_id, locale, name;

-- ==========================================================
-- List any concept with short name marked as local preferred
-- ==========================================================
SELECT concept_id, name, locale, locale_preferred
FROM concept_name
WHERE concept_name_type = 'SHORT'
AND locale_preferred
AND NOT voided
ORDER BY concept_id, locale;

-- ======================================================================
-- List any concept with more than one fully specified name in any locale
-- ======================================================================
SELECT concept_id, locale, name, concept_name_type
FROM concept_name cn1
WHERE concept_name_type = 'FULLY_SPECIFIED'
AND EXISTS (
  SELECT COUNT(*) as n, concept_id, locale
  FROM concept_name cn2
  WHERE cn2.concept_name_type = cn1.concept_name_type
  AND cn2.concept_id = cn1.concept_id
  AND cn2.locale = cn1.locale
  AND NOT cn2.voided
  GROUP BY cn2.concept_id, cn2.locale
  HAVING n > 1)
ORDER BY concept_id, locale, name;

-- =======================================================================
-- List any concepts that do not have a fully specified name in any locale
-- =======================================================================
SELECT DISTINCT concept_id
FROM concept_name cn1
WHERE NOT EXISTS (
  SELECT null FROM concept_name cn2
  WHERE cn2.concept_id = cn1.concept_id
  AND concept_name_type = 'FULLY_SPECIFIED'
  AND NOT cn2.voided)
GROUP BY concept_id
ORDER BY concept_id;

-- =============================
-- List invalid values name type
-- =============================
SELECT concept_id, name, concept_name_type
FROM concept_name
WHERE concept_name_type NOT IN ('FULLY_SPECIFIED', 'INDEX_TERM', 'SHORT')
AND NOT voided
ORDER BY concept_name_type;

-- =========================================
-- List invalid/unexpected values for locale
-- =========================================
SELECT concept_id, name, locale
FROM concept_name
WHERE locale NOT IN (
  'am', 'ar', 'bn', 'en', 'es', 'fr', 'ht', 'id', 'it', 'km',
  'nl', 'om', 'pt', 'ru', 'rw', 'sw', 'ti', 'ur', 'vi')
AND NOT voided
ORDER BY locale;

-- ===============================================================================
-- List any duplicated fully specified names (ignoring short names and index term)
-- ===============================================================================
-- Create indexed table of unvoided names of unretired concepts
-- with lowercase name for faster case-insensitive name comparisons
-- (avoid temporary table in MySQL 5.6 - https://bit.ly/MySQL5-6temp)
DROP TABLE IF EXISTS concept_name_lower;
CREATE TABLE concept_name_lower (
  concept_id int(11) DEFAULT NULL,
  locale varchar(50) CHARACTER SET utf8mb4 NOT NULL DEFAULT '',
  locale_preferred tinyint(1) DEFAULT 0,
  name varchar(255) CHARACTER SET utf8mb4 NOT NULL DEFAULT '',
  concept_name_type varchar(50) CHARACTER SET utf8mb4 DEFAULT NULL,
  lname varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin DEFAULT NULL,
  INDEX (concept_id),
  INDEX (locale),
  INDEX (lname),
  INDEX (name),
  INDEX (concept_name_type)
)
SELECT cn.concept_id, locale, locale_preferred, name, 
  concept_name_type, LOWER(name) AS lname
FROM concept_name cn
JOIN concept c ON c.concept_id = cn.concept_id
WHERE NOT voided
AND NOT retired;
 
-- List any duplicated fully specified names (ignoring short names and index terms)
SELECT cn1.concept_id, cn1.locale, cn1.locale_preferred, 
  cn1.name, cn1.concept_name_type
FROM concept_name_lower cn1
JOIN (
  SELECT cn2.lname, cn2.locale
  FROM concept_name_lower cn2
  WHERE (
    cn2.concept_name_type IS NULL
    AND cn2.locale_preferred
  )
  OR cn2.concept_name_type = 'FULLY_SPECIFIED'
  GROUP BY cn2.lname, cn2.locale
  HAVING COUNT(*) > 1
) dup ON dup.lname = cn1.lname AND dup.locale = cn1.locale
WHERE cn1.concept_name_type = 'FULLY_SPECIFIED'
OR EXISTS (
  SELECT NULL
  FROM concept_name_lower cn3
  WHERE cn3.lname = cn1.lname
  AND cn3.locale = cn1.locale
  AND cn3.concept_name_type = 'FULLY_SPECIFIED'
)
ORDER BY cn1.name, cn1.locale;
 
-- Clean up
DROP TABLE concept_name_lower;

-- ==========================================================
-- List any duplicated preferred names within the same locale
-- ==========================================================
-- Create indexed table of unvoided preferred names of unretired concepts 
-- with lowercase name for faster case-insensitive name comparisons
-- (avoid temporary table in MySQL 5.6 - https://bit.ly/MySQL5-6temp)
DROP TABLE IF EXISTS concept_name_preferred_lower;
CREATE TABLE concept_name_preferred_lower (
  concept_id int(11) DEFAULT NULL,
  locale varchar(50) CHARACTER SET utf8mb4 NOT NULL DEFAULT '',
  name varchar(255) CHARACTER SET utf8mb4 NOT NULL DEFAULT '',
  lname varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin DEFAULT NULL,
  INDEX (concept_id),
  INDEX (locale),
  INDEX (lname),
  INDEX (name)
)
SELECT cn.concept_id, locale, name, LOWER(name) AS lname
FROM concept_name cn
JOIN concept c ON c.concept_id = cn.concept_id
WHERE locale_preferred
AND NOT voided
AND NOT retired;

-- List any duplicated preferred names within the same locale
SELECT cn1.concept_id, cn1.locale, cn1.name
FROM concept_name_preferred_lower cn1
JOIN (
  SELECT cn2.lname, cn2.locale
  FROM concept_name_preferred_lower cn2
  GROUP BY cn2.lname, cn2.locale
  HAVING COUNT(*) > 1
) AS duplicates ON cn1.lname = duplicates.lname AND cn1.locale = duplicates.locale
ORDER BY cn1.name, cn1.locale, cn1.concept_id;

-- Clean up
DROP TABLE concept_name_preferred_lower;

-- ===============================================================
-- List any concepts with UUID that is not 36 characters in length
-- ===============================================================
SELECT concept_id, uuid
FROM concept
WHERE length(uuid) != 36
AND NOT retired
ORDER BY concept_id;

-- =============================================================================
-- List any duplicate mappings (same concept mapped to same code more than once)
-- =============================================================================
SELECT
  m1.concept_map_id,
  m1.concept_id,
  (SELECT name FROM concept_name n WHERE n.concept_id=m1.concept_id AND locale='en' AND locale_preferred) as name,
  (SELECT name FROM concept_map_type t WHERE t.concept_map_type_id = m1.concept_map_type_id) as type,
  (SELECT name FROM concept_reference_source WHERE concept_source_id = r1.concept_source_id) as source,
  r1.code
FROM
  concept_reference_map m1
  INNER JOIN concept_reference_map m2 ON m1.concept_id = m2.concept_id
  LEFT OUTER JOIN concept_reference_term r1 ON r1.concept_reference_term_id = m1.concept_reference_term_id
  LEFT OUTER JOIN concept_reference_term r2 ON r2.concept_reference_term_id = m2.concept_reference_term_id
WHERE
  m1.concept_map_id != m2.concept_map_id AND
  r1.concept_source_id = r2.concept_source_id AND
  r1.code = r2.code
ORDER BY m1.concept_id, r1.code, type;