Skip to content

Commit

Permalink
HIVE-28249: Parquet legacy timezone conversion converts march 1st to …
Browse files Browse the repository at this point in the history
…29th Feb and fails with a "not a leap year" exception (#5241) (Simhadri Govindappa, reviewed by Denys Kuzmenko and Stamatis Zampetakis)
  • Loading branch information
simhadri-g committed May 16, 2024
1 parent b7093e3 commit ea70fb8
Show file tree
Hide file tree
Showing 7 changed files with 214 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -191,12 +191,17 @@ public static Timestamp valueOf(String s) {
try {
localDateTime = LocalDateTime.parse(s);
} catch (DateTimeException e2) {
throw new IllegalArgumentException("Cannot create timestamp, parsing error " + s);
throw new IllegalArgumentException("Cannot create timestamp, parsing error " + s, e);
}
}
return new Timestamp(localDateTime);
}

/**
 * Shifts this timestamp back by the given number of days.
 * NOTE: this MUTATES the current instance (reassigns the internal localDateTime)
 * and returns {@code this} for call chaining — it does not return a new copy.
 *
 * @param days number of days to subtract (a negative value adds days, per
 *             {@code LocalDateTime.minusDays} semantics)
 * @return this same instance, after the adjustment
 */
public Timestamp minusDays(long days) {
  localDateTime = localDateTime.minusDays(days);
  return this;
}

public static Timestamp getTimestampFromTime(String s) {
return new Timestamp(LocalDateTime.of(LocalDate.now(),
LocalTime.parse(s, DateTimeFormatter.ISO_LOCAL_TIME)));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.LocalTime;
import java.time.Month;
import java.time.ZoneId;
import java.time.ZoneOffset;
import java.time.ZonedDateTime;
Expand All @@ -39,8 +40,7 @@
import java.util.TimeZone;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.hive.common.util.DateUtils;
import org.apache.commons.lang3.time.DateUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -50,6 +50,7 @@
import static java.time.temporal.ChronoField.MONTH_OF_YEAR;
import static java.time.temporal.ChronoField.SECOND_OF_MINUTE;
import static java.time.temporal.ChronoField.YEAR;
import static org.apache.hive.common.util.DateUtils.NANOS_PER_SEC;

public class TimestampTZUtil {

Expand All @@ -58,6 +59,8 @@ public class TimestampTZUtil {
private static final LocalTime DEFAULT_LOCAL_TIME = LocalTime.of(0, 0);
private static final Pattern SINGLE_DIGIT_PATTERN = Pattern.compile("[\\+-]\\d:\\d\\d");

public static final String NOT_LEAP_YEAR = "not a leap year";

private static final DateTimeFormatter TIMESTAMP_FORMATTER = new DateTimeFormatterBuilder()
// Date and Time Parts
.appendValue(YEAR, 4, 10, SignStyle.NORMAL).appendLiteral('-').appendValue(MONTH_OF_YEAR, 2, 2, SignStyle.NORMAL)
Expand Down Expand Up @@ -193,7 +196,18 @@ public static Timestamp convertTimestampToZone(Timestamp ts, ZoneId fromZone, Zo
java.util.Date date = formatter.parse(ts.format(TIMESTAMP_FORMATTER));
// Set the formatter to use a different timezone
formatter.setTimeZone(TimeZone.getTimeZone(toZone));
Timestamp result = Timestamp.valueOf(formatter.format(date));
Timestamp result;
try {
result = Timestamp.valueOf(formatter.format(date));
} catch (IllegalArgumentException ex) {
if (ex.getCause().getMessage().contains(NOT_LEAP_YEAR)) {
int leapYearDateAdjustment = ts.getMonth() == Month.MARCH.getValue() ? 2 : -2;
result = Timestamp.valueOf(formatter.format(DateUtils.addDays(date, leapYearDateAdjustment)))
.minusDays(leapYearDateAdjustment);
} else {
throw ex;
}
}
result.setNanos(ts.getNanos());
return result;
} catch (ParseException e) {
Expand All @@ -211,6 +225,6 @@ public static Timestamp convertTimestampToZone(Timestamp ts, ZoneId fromZone, Zo
}

/**
 * Converts a TimestampTZ to seconds since the epoch as a double, with the
 * nanosecond component as the fractional part.
 *
 * @param timestampTZ the zoned timestamp to convert
 * @return epoch seconds including the fractional (nanosecond) component
 */
public static double convertTimestampTZToDouble(TimestampTZ timestampTZ) {
  // Cast to double BEFORE dividing: if NANOS_PER_SEC is an integral constant,
  // plain division would truncate the fractional part to zero. The cast is a
  // no-op if the constant is already a floating-point type.
  return timestampTZ.getEpochSecond() + timestampTZ.getNanos() / (double) NANOS_PER_SEC;
}
}
Binary file added data/files/legacyLeapYearInSingaporeTZ
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
*/
package org.apache.hadoop.hive.ql.io.parquet.timestamp;

import java.time.DateTimeException;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.ZoneOffset;
import java.util.Calendar;
Expand Down Expand Up @@ -100,12 +102,23 @@ public static Timestamp getTimestamp(NanoTime nt, ZoneId targetZone, boolean leg
julianDay--;
}

JulianDate jDateTime;
jDateTime = JulianDate.of((double) julianDay);
JulianDate jDateTime = JulianDate.of((double) julianDay);
LocalDateTime localDateTime;
int leapYearDateAdjustment = 0;
try {
localDateTime = jDateTime.toLocalDateTime();
} catch (DateTimeException e) {
if (e.getMessage().contains(TimestampTZUtil.NOT_LEAP_YEAR) && legacyConversion) {
leapYearDateAdjustment = 2;
localDateTime = jDateTime.add(leapYearDateAdjustment).toLocalDateTime();
} else {
throw e;
}
}
Calendar calendar = getGMTCalendar();
calendar.set(Calendar.YEAR, jDateTime.toLocalDateTime().getYear());
calendar.set(Calendar.MONTH, jDateTime.toLocalDateTime().getMonth().getValue() - 1); //java calendar index starting at 1.
calendar.set(Calendar.DAY_OF_MONTH, jDateTime.toLocalDateTime().getDayOfMonth());
calendar.set(Calendar.YEAR, localDateTime.getYear());
calendar.set(Calendar.MONTH, localDateTime.getMonth().getValue() - 1); //java calendar index starting at 1.
calendar.set(Calendar.DAY_OF_MONTH, localDateTime.getDayOfMonth());

int hour = (int) (remainder / (NANOS_PER_HOUR));
remainder = remainder % (NANOS_PER_HOUR);
Expand All @@ -119,7 +132,9 @@ public static Timestamp getTimestamp(NanoTime nt, ZoneId targetZone, boolean leg
calendar.set(Calendar.SECOND, seconds);

Timestamp ts = Timestamp.ofEpochMilli(calendar.getTimeInMillis(), (int) nanos);
ts = TimestampTZUtil.convertTimestampToZone(ts, ZoneOffset.UTC, targetZone, legacyConversion);
ts = TimestampTZUtil.convertTimestampToZone(ts, ZoneOffset.UTC, targetZone, legacyConversion)
.minusDays(leapYearDateAdjustment);

return ts;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTime;
import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTimeUtils;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.MethodSource;

Expand Down Expand Up @@ -97,6 +98,69 @@ void testWriteHive2ReadHive4UsingLegacyConversionWithZone(String timestampString
}
}

/**
 * Verifies round-trip fidelity for timestamps on Julian leap years: a value written with the
 * Hive2 writer must come back unchanged through the Hive4 reader when legacy conversion is on.
 */
@ParameterizedTest(name = "{0}")
@MethodSource("generateTimestampsAndZoneIds")
void testWriteHive2ReadHive4UsingLegacyConversionWithJulianLeapYears(String timestampString, String zoneId) {
  TimeZone previousDefault = TimeZone.getDefault();
  try {
    TimeZone.setDefault(TimeZone.getTimeZone(zoneId));
    NanoTime written = writeHive2(timestampString);
    Timestamp roundTripped = readHive4(written, zoneId, true);
    assertEquals(timestampString, roundTripped.toString());
  } finally {
    // Always restore the JVM-wide default timezone so later tests are unaffected.
    TimeZone.setDefault(previousDefault);
  }
}

/**
 * Tests that timestamps written using Hive2 APIs on 28th Feb of Julian leap years are read back
 * correctly by Hive4 APIs when legacy conversion is on.
 */
@ParameterizedTest(name = "{0}")
@MethodSource("generateTimestampsAndZoneIds28thFeb")
void testWriteHive2ReadHive4UsingLegacyConversionWithJulianLeapYearsFor28thFeb(String timestampString,
    String zoneId) {
  TimeZone original = TimeZone.getDefault();
  try {
    TimeZone.setDefault(TimeZone.getTimeZone(zoneId));
    NanoTime nt = writeHive2(timestampString);
    Timestamp ts = readHive4(nt, zoneId, true);
    assertEquals(timestampString, ts.toString());
  } finally {
    // Restore the JVM-wide default timezone.
    TimeZone.setDefault(original);
  }
}

/**
 * Edge cases where the writer and reader zones differ across the Julian leap-year boundary;
 * the expected value is supplied explicitly instead of assuming an exact round trip.
 */
@ParameterizedTest(name = " - From: Zone {0}, timestamp: {2}, To: Zone:{1}, expected Timestamp {3}")
@MethodSource("julianLeapYearEdgeCases")
void testWriteHive2ReadHive4UsingLegacyConversionWithJulianLeapYearsEdgeCase(String fromZoneId, String toZoneId,
    String timestampString, String expected) {
  TimeZone savedDefault = TimeZone.getDefault();
  try {
    TimeZone.setDefault(TimeZone.getTimeZone(fromZoneId));
    Timestamp actual = readHive4(writeHive2(timestampString), toZoneId, true);
    assertEquals(expected, actual.toString());
  } finally {
    // Restore the JVM-wide default timezone so later tests are unaffected.
    TimeZone.setDefault(savedDefault);
  }
}

/**
 * Hand-picked (fromZone, toZone, input, expected) cases around 0200-02-27..0200-03-03 at the
 * extreme UTC offsets (GMT-12:00 / GMT+14:00), where the conversion crosses the Julian-only
 * leap-day boundary. The last case checks that a same-zone conversion is an identity.
 */
private static Stream<Arguments> julianLeapYearEdgeCases() {
  return Stream.of(Arguments.of("GMT-12:00", "GMT+14:00", "0200-02-27 22:00:00.000000001",
          "0200-03-01 00:00:00.000000001"),
      Arguments.of("GMT+14:00", "GMT-12:00", "0200-03-01 00:00:00.000000001",
          "0200-02-27 22:00:00.000000001"),
      Arguments.of("GMT+14:00", "GMT-12:00", "0200-03-02 00:00:00.000000001",
          "0200-02-28 22:00:00.000000001"),
      Arguments.of("GMT-12:00", "GMT+14:00", "0200-03-02 00:00:00.000000001",
          "0200-03-03 02:00:00.000000001"),
      Arguments.of("GMT-12:00", "GMT+12:00", "0200-02-28 00:00:00.000000001", "0200-03-01 00:00:00.000000001"),
      Arguments.of("GMT+12:00", "GMT-12:00", "0200-03-01 00:00:00.000000001", "0200-02-28 00:00:00.000000001"),
      Arguments.of("Asia/Singapore", "Asia/Singapore", "0200-03-01 00:00:00.000000001",
          "0200-03-01 00:00:00.000000001"));
}

/**
* Tests that timestamps written using Hive4 APIs are read correctly by Hive4 APIs when legacy conversion is on.
*/
Expand Down Expand Up @@ -178,6 +242,62 @@ public String get() {
.limit(3000), Stream.of("9999-12-31 23:59:59.999"));
}

/**
 * Generates (timestamp, zoneId) argument pairs for different timezones. Here we are testing
 * UTC+14 (Pacific/Kiritimati) and UTC-12 (Etc/GMT+12) along with a few other zones.
 */
private static Stream<Arguments> generateTimestampsAndZoneIds() {
  return generateJulianLeapYearTimestamps().flatMap(
      timestampString -> Stream.of("Asia/Singapore", "Pacific/Kiritimati", "Etc/GMT+12", "Pacific/Niue")
          .map(zoneId -> Arguments.of(timestampString, zoneId)));
}

/**
 * Pairs every generated 28th-Feb century timestamp with a representative set of zone ids
 * (including the UTC+14 and UTC-12 extremes).
 */
private static Stream<Arguments> generateTimestampsAndZoneIds28thFeb() {
  String[] zoneIds = {"Asia/Singapore", "Pacific/Kiritimati", "Etc/GMT+12", "Pacific/Niue"};
  return generateJulianLeapYearTimestamps28thFeb()
      .flatMap(ts -> Stream.of(zoneIds).map(zone -> Arguments.of(ts, zone)));
}

/**
 * Produces one timestamp per century year (multiples of 100) at 1st March 00:00:00.000000001
 * (the day after the Julian-only leap day), capped at 3000 values, followed by a single
 * end-of-range sentinel timestamp.
 */
private static Stream<String> generateJulianLeapYearTimestamps() {
  Stream<String> perCentury = Stream.iterate(0, i -> i + 1)
      .map(i -> {
        int year = ((i % 9999) + 1) * 100;
        return zeros(4 - digits(year)) + year + "-03-01 00:00:00.000000001";
      })
      // Exclude dates falling in the default Gregorian change date since legacy code does not
      // handle that interval gracefully. It is expected that these do not work well when legacy
      // APIs are in use.
      .filter(s -> !s.startsWith("1582-10"))
      .limit(3000);
  return Stream.concat(perCentury, Stream.of("9999-12-31 23:59:59.999"));
}

/**
 * Generates one timestamp per century year (multiples of 100) at 28th Feb 00:00:00.000000001 —
 * the day adjacent to the Julian-only leap day — plus a single end-of-range sentinel timestamp.
 */
private static Stream<String> generateJulianLeapYearTimestamps28thFeb() {
  return Stream.concat(Stream.generate(new Supplier<String>() {
    int i = 0;  // call counter driving the generated century year

    @Override
    public String get() {
      StringBuilder sb = new StringBuilder(29);
      int year = ((i % 9999) + 1) * 100;
      sb.append(zeros(4 - digits(year)));
      sb.append(year);
      sb.append("-02-28 00:00:00.000000001");
      i++;
      return sb.toString();
    }
  })
  // Exclude dates falling in the default Gregorian change date since legacy code does not handle
  // that interval gracefully. It is expected that these do not work well when legacy APIs are in use.
  .filter(s -> !s.startsWith("1582-10")).limit(3000), Stream.of("9999-12-31 23:59:59.999"));
}


private static int digits(int number) {
int digits = 0;
do {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
--! qt:timezone:Asia/Singapore
--These files were created by inserting timestamp '2019-01-01 00:30:30.111111111' where writer time zone is Europe/Rome.

--older writer: time zone dependent behavior. convert to reader time zone
Expand All @@ -13,4 +14,15 @@ create table new_table (t timestamp) stored as parquet;

load data local inpath '../../data/files/parquet_historical_timestamp_new.parq' into table new_table;

select * from new_table;
select * from new_table;

-- Inserted data in singapore timezone: insert into default.test_sgt700 values ('0500-03-01 00:00:00'),('0600-03-01
-- 00:00:00'),('0700-03-01 00:00:00'), ('0200-03-01 00:00:00'), ('0400-03-01 00:00:00'), ('0800-03-02 00:00:00'),
--('0800-03-03 00:00:00');

CREATE EXTERNAL TABLE `TEST_SGT1`(`currtime` timestamp) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat';

LOAD DATA LOCAL INPATH '../../data/files/legacyLeapYearInSingaporeTZ' INTO TABLE TEST_SGT1;
SELECT * FROM TEST_SGT1;
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ POSTHOOK: query: select * from legacy_table
POSTHOOK: type: QUERY
POSTHOOK: Input: default@legacy_table
#### A masked pattern was here ####
2018-12-31 16:30:30.111111111
2019-01-01 08:30:30.111111111
PREHOOK: query: create table new_table (t timestamp) stored as parquet
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
Expand All @@ -48,3 +48,38 @@ POSTHOOK: type: QUERY
POSTHOOK: Input: default@new_table
#### A masked pattern was here ####
2019-01-01 00:30:30.111111111
PREHOOK: query: CREATE EXTERNAL TABLE `TEST_SGT1`(`currtime` timestamp) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@TEST_SGT1
POSTHOOK: query: CREATE EXTERNAL TABLE `TEST_SGT1`(`currtime` timestamp) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' STORED AS
INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat'
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@TEST_SGT1
PREHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/legacyLeapYearInSingaporeTZ' INTO TABLE TEST_SGT1
PREHOOK: type: LOAD
#### A masked pattern was here ####
PREHOOK: Output: default@test_sgt1
POSTHOOK: query: LOAD DATA LOCAL INPATH '../../data/files/legacyLeapYearInSingaporeTZ' INTO TABLE TEST_SGT1
POSTHOOK: type: LOAD
#### A masked pattern was here ####
POSTHOOK: Output: default@test_sgt1
PREHOOK: query: SELECT * FROM TEST_SGT1
PREHOOK: type: QUERY
PREHOOK: Input: default@test_sgt1
#### A masked pattern was here ####
POSTHOOK: query: SELECT * FROM TEST_SGT1
POSTHOOK: type: QUERY
POSTHOOK: Input: default@test_sgt1
#### A masked pattern was here ####
0500-03-01 00:00:00
0600-03-01 00:00:00
0700-03-01 00:00:00
0200-03-01 00:00:00
0400-03-01 00:00:00
0800-03-02 00:00:00
0800-03-03 00:00:00

0 comments on commit ea70fb8

Please sign in to comment.