Deduplicate table SQL with nested rows (type STRUCT)

I have a SQL table (in BigQuery) with possible duplicated rows. The table has over 20 columns, some of them are nested (data type “STRUCT)”. I want to deduplicate the table.

I can’t simply query SELECT DISTINCT * because I get an error

Query error: Column options of type STRUCT cannot be used in SELECT DISTINCT

So far, I tried to create a unique ID based on a hash of certain columns. I have now this unique ID (called sha256), but I can’t figure out a way of selecting only rows with unique hash.

I tried to GROUP BY, but it doesn’t work with STRUCT type, and I tried also to INNER JOIN with a table containing only unique hashed, but I get duplicates also.

For reference, here are 2 example rows of the dataset:

  {
    "sha256": "un2k3TUtzwzmQMvxfrjztsh/A/GW3WWzV4U4CezqceA=",
    "has_phone": true,
    "options": {
      "sub_toplist": true,
      "gallery": false,
      "urgent": false,
      "has_option": true,
      "photosup": true,
      "booster": false
    },
    "calendar": {
      "dates": null
    },
    "owner": {
      "siren": null,
      "pro_rates_link": null,
      "user_id": "f0d94687-1a24-4ed4-8adb-7faded053ca8",
      "type": "private",
      "no_salesmen": true,
      "name": "marius",
      "store_id": "5022456"
    },
    "location": {
      "feature": {
        "properties": null,
        "geometry": {
          "coordinates": [
            "9.41733",
            "42.54701"
          ],
          "type": "Point"
        },
        "type": "Feature"
      },
      "is_shape": true,
      "provider": "here",
      "lng": "9.41733",
      "lat": "42.54701",
      "zipcode": "20290",
      "city_label": "Lucciana 20290",
      "city": "Lucciana",
      "region_name": "Corse",
      "department_name": null,
      "source": "city",
      "department_id": "0",
      "region_id": "9"
    },
    "attributes": {
      "pro_rates_link": null,
      "immo_sell_type": "old",
      "ges": "a",
      "square": "92",
      "rooms": "4",
      "energy_rate": "b",
      "is_import": false,
      "custom_ref": null,
      "lease_type": "sell",
      "real_estate_type": "1",
      "fai_included": null,
      "type_real_estate_sale": null
    },
    "price_calendar": null,
    "price": [
      "270000"
    ],
    "body": "text",
    "url": "https://www.example.fr/ventes_immobilieres/1729537955.htm",
    "category_name": "Ventes immobilières",
    "category_id": "9",
    "images": {
      "urls_thumb": [
        "https://img3.example.fr/ad-thumb/d63e236ce3546906b3ce661640a7cf858d0a0593.jpg"
      ],
      "urls": [
        "https://img3.example.fr/ad-image/ac6bd9ce0cc3aa507727ddece51f437d77ae4cfa.jpg",
      ],
      "nb_images": "7",
      "small_url": "https://img3.example.fr/ad-small/ac6bd9ce0cc3aa507727ddece51f437d77ae4cfa.jpg",
      "thumb_url": "https://img3.example.fr/ad-thumb/ac6bd9ce0cc3aa507727ddece51f437d77ae4cfa.jpg"
    },
    "ad_type": "offer",
    "first_publication_date": "2020-01-02 15:00:46 UTC",
    "status": "active",
    "subject": "Villa à Lucciana",
    "index_date": "2020-01-16 15:00:45 UTC",
    "expiration_date": "2020-03-02 15:00:46 UTC",
    "list_id": "1729537955"
  },
  {
    "sha256": "wCMrggkqSJ3PgbkuWAgBpCMtFfkJDRlz6TOeO5Nngsg=",
    "has_phone": true,
    "options": {
      "sub_toplist": false,
      "gallery": false,
      "urgent": false,
      "has_option": false,
      "photosup": false,
      "booster": false
    },
    "calendar": {
      "dates": null
    },
    "owner": {
      "siren": null,
      "pro_rates_link": null,
      "user_id": "ae0f432d-0aa2-4828-a20b-3472255588b4",
      "type": "private",
      "no_salesmen": true,
      "name": "M.Milleliri",
      "store_id": "12132533"
    },
    "location": {
      "feature": {
        "properties": null,
        "geometry": {
          "coordinates": [
            "9.1917",
            "41.54506"
          ],
          "type": "Point"
        },
        "type": "Feature"
      },
      "is_shape": true,
      "provider": "here",
      "lng": "9.1917",
      "lat": "41.54506",
      "zipcode": "20146",
      "city_label": "Sotta 20146",
      "city": "Sotta",
      "region_name": "Corse",
      "department_name": null,
      "source": "city",
      "department_id": "0",
      "region_id": "9"
    },
    "attributes": {
      "pro_rates_link": null,
      "immo_sell_type": "old",
      "ges": "Non renseigné",
      "square": null,
      "rooms": null,
      "energy_rate": "Non renseigné",
      "is_import": false,
      "custom_ref": null,
      "lease_type": "sell",
      "real_estate_type": "3",
      "fai_included": null,
      "type_real_estate_sale": null
    },
    "price_calendar": null,
    "price": [
      "100000"
    ],
    "body": "text",
    "url": "https://www.example.fr/ventes_immobilieres/1736199673.htm",
    "category_name": "Ventes immobilières",
    "category_id": "9",
    "images": {
      "urls_thumb": [
        "https://img3.example.fr/ad-thumb/4f3632dc8e5c50075aa6c6e4b559e2042546f009.jpg"
      ],
      "urls": [
        "https://img3.example.fr/ad-image/4f3632dc8e5c50075aa6c6e4b559e2042546f009.jpg"
      ],
      "urls_large": [
        "https://img3.example.fr/ad-large/4f3632dc8e5c50075aa6c6e4b559e2042546f009.jpg"
      ],
      "nb_images": "1",
      "small_url": "https://img3.example.fr/ad-small/4f3632dc8e5c50075aa6c6e4b559e2042546f009.jpg",
      "thumb_url": "https://img3.example.fr/ad-thumb/4f3632dc8e5c50075aa6c6e4b559e2042546f009.jpg"
    },
    "ad_type": "offer",
    "first_publication_date": "2020-01-16 14:21:05 UTC",
    "status": "active",
    "subject": "Terrain 1250 m2 Sotta",
    "index_date": "2020-01-16 14:21:05 UTC",
    "expiration_date": "2020-03-16 14:21:05 UTC",
    "list_id": "1736199673"
  }

​x
 
  {    "sha256": "un2k3TUtzwzmQMvxfrjztsh/A/GW3WWzV4U4CezqceA=",    "has_phone": true,    "options": {      "sub_toplist": true,      "gallery": false,      "urgent": false,      "has_option": true,      "photosup": true,      "booster": false    },    "calendar": {      "dates": null    },    "owner": {      "siren": null,      "pro_rates_link": null,      "user_id": "f0d94687-1a24-4ed4-8adb-7faded053ca8",      "type": "private",      "no_salesmen": true,      "name": "marius",      "store_id": "5022456"    },    "location": {      "feature": {        "properties": null,        "geometry": {          "coordinates": [            "9.41733",            "42.54701"          ],          "type": "Point"        },        "type": "Feature"      },      "is_shape": true,      "provider": "here",      "lng": "9.41733",      "lat": "42.54701",      "zipcode": "20290",      "city_label": "Lucciana 20290",      "city": "Lucciana",      "region_name": "Corse",      "department_name": null,      "source": "city",      "department_id": "0",      "region_id": "9"    },    "attributes": {      "pro_rates_link": null,      "immo_sell_type": "old",      "ges": "a",      "square": "92",      "rooms": "4",      "energy_rate": "b",      "is_import": false,      "custom_ref": null,      "lease_type": "sell",      "real_estate_type": "1",      "fai_included": null,      "type_real_estate_sale": null    },    "price_calendar": null,    "price": [      "270000"    ],    "body": "text",    "url": "https://www.example.fr/ventes_immobilieres/1729537955.htm",    "category_name": "Ventes immobilières",    "category_id": "9",    "images": {      "urls_thumb": [        "https://img3.example.fr/ad-thumb/d63e236ce3546906b3ce661640a7cf858d0a0593.jpg"      ],      "urls": [        "https://img3.example.fr/ad-image/ac6bd9ce0cc3aa507727ddece51f437d77ae4cfa.jpg",      ],      "nb_images": "7",      "small_url": "https://img3.example.fr/ad-small/ac6bd9ce0cc3aa507727ddece51f437d77ae4cfa.jpg",      "thumb_url": "https://img3.example.fr/ad-thumb/ac6bd9ce0cc3aa507727ddece51f437d77ae4cfa.jpg"    },    "ad_type": "offer",    "first_publication_date": "2020-01-02 15:00:46 UTC",    "status": "active",    "subject": "Villa à Lucciana",    "index_date": "2020-01-16 15:00:45 UTC",    "expiration_date": "2020-03-02 15:00:46 UTC",    "list_id": "1729537955"  },  {    "sha256": "wCMrggkqSJ3PgbkuWAgBpCMtFfkJDRlz6TOeO5Nngsg=",    "has_phone": true,    "options": {      "sub_toplist": false,      "gallery": false,      "urgent": false,      "has_option": false,      "photosup": false,      "booster": false    },    "calendar": {      "dates": null    },    "owner": {      "siren": null,      "pro_rates_link": null,      "user_id": "ae0f432d-0aa2-4828-a20b-3472255588b4",      "type": "private",      "no_salesmen": true,      "name": "M.Milleliri",      "store_id": "12132533"    },    "location": {      "feature": {        "properties": null,        "geometry": {          "coordinates": [            "9.1917",            "41.54506"          ],          "type": "Point"        },        "type": "Feature"      },      "is_shape": true,      "provider": "here",      "lng": "9.1917",      "lat": "41.54506",      "zipcode": "20146",      "city_label": "Sotta 20146",      "city": "Sotta",      "region_name": "Corse",      "department_name": null,      "source": "city",      "department_id": "0",      "region_id": "9"    },    "attributes": {      "pro_rates_link": null,      "immo_sell_type": "old",      "ges": "Non renseigné",      "square": null,      "rooms": null,      "energy_rate": "Non renseigné",      "is_import": false,      "custom_ref": null,      "lease_type": "sell",      "real_estate_type": "3",      "fai_included": null,      "type_real_estate_sale": null    },    "price_calendar": null,    "price": [      "100000"    ],    "body": "text",    "url": "https://www.example.fr/ventes_immobilieres/1736199673.htm",    "category_name": "Ventes immobilières",    "category_id": "9",    "images": {      "urls_thumb": [        "https://img3.example.fr/ad-thumb/4f3632dc8e5c50075aa6c6e4b559e2042546f009.jpg"      ],      "urls": [        "https://img3.example.fr/ad-image/4f3632dc8e5c50075aa6c6e4b559e2042546f009.jpg"      ],      "urls_large": [        "https://img3.example.fr/ad-large/4f3632dc8e5c50075aa6c6e4b559e2042546f009.jpg"      ],      "nb_images": "1",      "small_url": "https://img3.example.fr/ad-small/4f3632dc8e5c50075aa6c6e4b559e2042546f009.jpg",      "thumb_url": "https://img3.example.fr/ad-thumb/4f3632dc8e5c50075aa6c6e4b559e2042546f009.jpg"    },    "ad_type": "offer",    "first_publication_date": "2020-01-16 14:21:05 UTC",    "status": "active",    "subject": "Terrain 1250 m2 Sotta",    "index_date": "2020-01-16 14:21:05 UTC",    "expiration_date": "2020-03-16 14:21:05 UTC",    "list_id": "1736199673"  }​

and the query I’m working on so far:

WITH
  table_unique_hash AS (
  SELECT
    DISTINCT(SHA256(CONCAT(FORMAT_TIMESTAMP('%Y/%m/%d_%H:%M:%S_', index_date), CAST(list_id AS STRING)))) AS sha256
  FROM
    `test_bucket_data.daily_table`),

  table_hashed AS (
  SELECT
    SHA256(CONCAT(FORMAT_TIMESTAMP('%Y/%m/%d_%H:%M:%S_', index_date), CAST(list_id AS STRING))) AS sha256, *
  FROM
    `test_bucket_data.daily_table`)
SElECT * FROM table_hashed
limit 10;

 
WITH  table_unique_hash AS (  SELECT    DISTINCT(SHA256(CONCAT(FORMAT_TIMESTAMP('%Y/%m/%d_%H:%M:%S_', index_date), CAST(list_id AS STRING)))) AS sha256  FROM    `test_bucket_data.daily_table`),​  table_hashed AS (  SELECT    SHA256(CONCAT(FORMAT_TIMESTAMP('%Y/%m/%d_%H:%M:%S_', index_date), CAST(list_id AS STRING))) AS sha256, *  FROM    `test_bucket_data.daily_table`)SElECT * FROM table_hashedlimit 10;​

A solution would be to find a way of inner join the table_hashed and the table_unique_hash on the sha256 column…

Thanks for your help!

Answer

I found a workaround based on this topic. The combination of GROUP BY and ANY function for all the STRUCT columns made it work!

SELECT
  has_phone,
  ANY_VALUE(options) as options,
  ANY_VALUE(calendar) as calendar,
  ANY_VALUE(owner) as owner,
  ANY_VALUE(location) as location,
  ANY_VALUE(attributes) as attributes,
  price_calendar,
  price,
  body,
  url,
  category_name,
  category_id,
  ANY_VALUE(images) as images,
  ad_type,
  first_publication_date,
  status,
  subject,
  index_date,
  expiration_date,
  list_id,
FROM
  `{table_name}`
Group by
  has_phone,
  price_calendar,
  price,
  body,
  url,
  category_name,
  category_id,
  ad_type,
  first_publication_date,
  status,
  subject,
  index_date,
  expiration_date,
  list_id

 
SELECT  has_phone,  ANY_VALUE(options) as options,  ANY_VALUE(calendar) as calendar,  ANY_VALUE(owner) as owner,  ANY_VALUE(location) as location,  ANY_VALUE(attributes) as attributes,  price_calendar,  price,  body,  url,  category_name,  category_id,  ANY_VALUE(images) as images,  ad_type,  first_publication_date,  status,  subject,  index_date,  expiration_date,  list_id,FROM  `{table_name}`Group by  has_phone,  price_calendar,  price,  body,  url,  category_name,  category_id,  ad_type,  first_publication_date,  status,  subject,  index_date,  expiration_date,  list_id​

Note: my “price” field was previously an array; I transformed it in my source json to an int

Advertisement

Answer