SQL*Loader Advanced Techniques

This guide covers advanced SQL*Loader features and techniques for high-performance bulk data loading and complex data transformation scenarios.
Direct Path Loading

Direct path loading bypasses the SQL engine and writes directly to database blocks, providing significant performance improvements for large data loads.
1. Direct Path Basics

```bash
# Enable direct path loading
sqlldr hr/password control=employees.ctl direct=true

# Direct path with optimizations
sqlldr hr/password control=employees.ctl \
  direct=true \
  parallel=true \
  columnarrayrows=5000 \
  streamsize=256000
```
Control File for Direct Path:
```
LOAD DATA
INFILE 'large_employees.csv'
APPEND
INTO TABLE employees
FIELDS TERMINATED BY ','
(
  employee_id,
  first_name,
  last_name,
  hire_date DATE "YYYY-MM-DD",
  salary
)
```
2. Direct Path Restrictions

Direct path loading has limitations:

- No triggers fire during the load
- CHECK and referential integrity constraints are disabled during the load and must be re-enabled and re-validated afterward
- Indexes are either maintained at the end of the load or marked UNUSABLE and must be rebuilt
- SQL string expressions in the control file are restricted in some Oracle releases
```sql
-- Pre-load preparation for direct path
ALTER TABLE employees DISABLE ALL TRIGGERS;
ALTER TABLE employees DISABLE CONSTRAINT fk_emp_dept;
ALTER INDEX emp_name_idx UNUSABLE;

-- Load data using direct path

-- Post-load tasks
ALTER TABLE employees ENABLE ALL TRIGGERS;
ALTER TABLE employees ENABLE CONSTRAINT fk_emp_dept;
ALTER INDEX emp_name_idx REBUILD PARALLEL 4;
```
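After the post-load step, it is worth confirming that nothing was left half-enabled. A quick data-dictionary check (run as the schema owner) might look like:

```sql
-- Any indexes still unusable after the rebuild?
SELECT index_name, status
FROM   user_indexes
WHERE  status = 'UNUSABLE';

-- Any constraints left disabled or not yet re-validated?
SELECT constraint_name, status, validated
FROM   user_constraints
WHERE  table_name = 'EMPLOYEES'
AND    (status = 'DISABLED' OR validated = 'NOT VALIDATED');
```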
Parallel Loading

1. Single File Parallel Loading
```bash
#!/bin/bash
# Load a single large file using parallel processes

DATA_FILE="large_data.csv"
PARALLEL_DEGREE=4

# Split the file into numbered chunks (chunk_00, chunk_01, ...)
lines_per_file=$(($(wc -l < "$DATA_FILE") / PARALLEL_DEGREE + 1))
split -d -l "$lines_per_file" "$DATA_FILE" chunk_

# Create a control file for each chunk
i=1
for chunk in chunk_*; do
  cat > "load_chunk_${i}.ctl" << EOF
LOAD DATA
INFILE '$chunk'
APPEND
INTO TABLE large_table
FIELDS TERMINATED BY ','
(
  id,
  description,
  amount,
  created_date DATE "YYYY-MM-DD"
)
EOF
  i=$((i + 1))
done

# Load chunks in parallel
for ctl in load_chunk_*.ctl; do
  name=${ctl%.ctl}
  sqlldr hr/password control="$ctl" \
    direct=true parallel=true \
    log="${name}.log" &
done

# Wait for all processes to complete
wait
echo "Parallel loading completed"
```
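A quick sanity check after the processes finish is to sum the per-chunk counts from the logs against the input file; this catches silent rejects early. A sketch, continuing from the variables above and assuming standard SQL*Loader log wording:

```bash
# Compare input line count with the total rows loaded across all chunks
input_rows=$(wc -l < "$DATA_FILE")
loaded_rows=$(grep -h "Rows successfully loaded" load_chunk_*.log | awk '{sum += $1} END {print sum}')
echo "Input: $input_rows  Loaded: ${loaded_rows:-0}"

# Flag any chunk that produced rejected records
for log in load_chunk_*.log; do
  rejects=$(grep "Total logical records rejected" "$log" | awk '{print $NF}')
  [ "${rejects:-0}" -gt 0 ] && echo "WARNING: $log reported $rejects rejected records"
done
```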
2. Multiple File Parallel Loading
```bash
#!/bin/bash
# Load multiple files concurrently

FILES=(file1.csv file2.csv file3.csv file4.csv)
PARALLEL_DEGREE=${#FILES[@]}

# Function to load a single file
load_file() {
  local file=$1
  local basename=$(basename "$file" .csv)

  sqlldr hr/password control=load_template.ctl \
    data="$file" \
    log="${basename}.log" \
    bad="${basename}.bad" \
    direct=true
}

# Load files in parallel
for file in "${FILES[@]}"; do
  load_file "$file" &
done

wait
echo "All files loaded"
```
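The script relies on a shared load_template.ctl, which this guide does not show; the data= command-line parameter overrides the INFILE named in the control file, so one template can serve every file. A minimal sketch (the table and column names here are hypothetical):

```
LOAD DATA
INFILE 'placeholder.csv'  -- overridden by the data= command-line parameter
APPEND
INTO TABLE staging_table  -- hypothetical target table
FIELDS TERMINATED BY ','
(
  id,
  description,
  amount,
  created_date DATE "YYYY-MM-DD"
)
```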
Complex Data Transformations
1. Advanced Field Transformations

```
-- complex_transformations.ctl
LOAD DATA
INFILE 'employee_data.csv'
INTO TABLE employees
FIELDS TERMINATED BY ','
OPTIONALLY ENCLOSED BY '"'
TRAILING NULLCOLS
(
  employee_id,
  first_name     "INITCAP(:first_name)",
  last_name      "UPPER(:last_name)",
  full_name      EXPRESSION ":first_name || ' ' || :last_name",
  email          EXPRESSION "LOWER(:first_name) || '.' || LOWER(:last_name) || '@company.com'",
  phone          "REGEXP_REPLACE(:phone, '[^0-9]', '')",
  hire_date      DATE "YYYY-MM-DD",
  salary         "TO_NUMBER(:salary, '999999.99')",
  commission     "DECODE(:commission, '', NULL, TO_NUMBER(:commission)/100)",
  department_id  "DECODE(:department_id, 'HR', 10, 'IT', 20, 'SALES', 30, 99)",
  status         "DECODE(:status, 'A', 'ACTIVE', 'I', 'INACTIVE', 'UNKNOWN')",
  created_date   SYSDATE,
  created_by     CONSTANT 'SQLLDR'
)
```

Columns computed purely from other fields (full_name, email) carry the EXPRESSION keyword so that SQL*Loader does not consume a column from the data file for them.
2. Conditional Loading with WHEN Clause

```
-- conditional_loading.ctl
LOAD DATA
INFILE 'mixed_records.csv'

-- Load employees (records starting with 'E')
INTO TABLE employees
WHEN (1:1) = 'E'
FIELDS TERMINATED BY ','
(
  record_type  FILLER POSITION(1),  -- POSITION(1) resets field scanning for each INTO TABLE clause
  employee_id,
  first_name,
  last_name,
  department_id
)

-- Load departments (records starting with 'D')
INTO TABLE departments
WHEN (1:1) = 'D'
FIELDS TERMINATED BY ','
(
  record_type  FILLER POSITION(1),
  department_id,
  department_name,
  location_id
)

-- Load locations (records starting with 'L')
INTO TABLE locations
WHEN (1:1) = 'L'
FIELDS TERMINATED BY ','
(
  record_type  FILLER POSITION(1),
  location_id,
  street_address,
  city,
  state_province,
  country_id
)
```
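For reference, a few hypothetical lines of mixed_records.csv, showing how the leading record-type column routes each row to its table:

```
E,1001,John,Smith,20
D,20,Information Technology,1700
L,1700,2004 Charade Rd,Seattle,Washington,US
E,1002,Jane,Doe,20
```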
3. Loading Hierarchical Data
```
-- hierarchical_loading.ctl
LOAD DATA
INFILE 'hierarchical_data.csv'

-- Parent records
INTO TABLE categories
WHEN (record_type = 'CAT')
FIELDS TERMINATED BY ','
(
  record_type    FILLER POSITION(1),
  category_id,
  category_name,
  parent_id      NULLIF parent_id=BLANKS
)

-- Child records
INTO TABLE products
WHEN (record_type = 'PROD')
FIELDS TERMINATED BY ','
(
  record_type    FILLER POSITION(1),
  product_id,
  product_name,
  category_id,
  unit_price     "TO_NUMBER(:unit_price, '9999.99')"
)
```
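Because both record types arrive in one pass, a child row can reference a category that was rejected. A quick post-load orphan check, assuming a products.category_id to categories.category_id relationship:

```sql
-- Products whose category was not loaded
SELECT p.product_id, p.category_id
FROM   products p
WHERE  NOT EXISTS (
  SELECT 1 FROM categories c
  WHERE  c.category_id = p.category_id
);
```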
LOB Data Loading
1. Loading CLOB Data

```
-- clob_loading.ctl
LOAD DATA
INFILE 'documents.csv'
INTO TABLE document_store
FIELDS TERMINATED BY ','
(
  doc_id,
  title,
  content       CHAR(4000000),  -- large character field loaded into a CLOB
  created_date  DATE "YYYY-MM-DD"
)
```
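The control file above assumes a target table whose content column is a CLOB; a minimal sketch of such a table (names match the control file, everything else is illustrative):

```sql
-- Hypothetical target table for the CLOB load above
CREATE TABLE document_store (
  doc_id        NUMBER PRIMARY KEY,
  title         VARCHAR2(200),
  content       CLOB,
  created_date  DATE
);
```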
2. Loading External LOB Files
```
-- external_lob_loading.ctl
LOAD DATA
INFILE 'lob_metadata.csv'
INTO TABLE image_store
FIELDS TERMINATED BY ','
(
  image_id,
  image_name,
  image_path     FILLER CHAR(255),  -- filler is defined before the LOB field that references it
  image_data     LOBFILE(image_path) TERMINATED BY EOF,
  uploaded_date  SYSDATE
)
```
Data File (lob_metadata.csv):
```
1,logo.jpg,/images/logo.jpg
2,banner.png,/images/banner.png
3,profile.gif,/images/profile.gif
```
3. Loading Binary LOB Data
```
-- blob_loading.ctl
LOAD DATA
INFILE 'binary_files.csv'
INTO TABLE binary_store
FIELDS TERMINATED BY ','
(
  file_id,
  filename,
  file_size,
  file_path  FILLER CHAR(255),
  file_data  LOBFILE(file_path) TERMINATED BY EOF
)
```
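After either LOB load, comparing the stored LOB length against the recorded file size is a cheap integrity check. A sketch using DBMS_LOB.GETLENGTH against the binary_store table above:

```sql
-- Rows where the stored BLOB length does not match the expected file size
SELECT file_id, filename, file_size,
       DBMS_LOB.GETLENGTH(file_data) AS stored_bytes
FROM   binary_store
WHERE  DBMS_LOB.GETLENGTH(file_data) <> file_size;
```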
Advanced Error Handling
1. Comprehensive Error Management

```
-- error_handling.ctl
OPTIONS (ERRORS=1000, ROWS=5000)  -- allow up to 1000 bad records, commit every 5000 rows
LOAD DATA
INFILE 'employees.csv'
BADFILE 'employees.bad'
DISCARDFILE 'employees.dis'
APPEND
INTO TABLE employees
FIELDS TERMINATED BY ','
OPTIONALLY ENCLOSED BY '"'
TRAILING NULLCOLS
(
  employee_id    "DECODE(:employee_id, '', NULL, TO_NUMBER(:employee_id))",
  first_name     CHAR(50) NULLIF first_name=BLANKS,
  last_name      CHAR(50) NULLIF last_name=BLANKS,
  email          "CASE WHEN REGEXP_LIKE(:email, '^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$') THEN :email ELSE NULL END",
  hire_date      DATE "YYYY-MM-DD" NULLIF hire_date=BLANKS,
  salary         "CASE WHEN TO_NUMBER(:salary) > 0 THEN TO_NUMBER(:salary) ELSE NULL END",
  department_id  "DECODE(:department_id, '', 99, TO_NUMBER(:department_id))"
)
```

ERRORS and ROWS are command-line parameters, so inside the control file they belong in an OPTIONS clause rather than standing alone.
2. Custom Error Logging
```bash
#!/bin/bash
# Load with comprehensive error tracking

DATA_FILE=$1
TABLE_NAME=$2
TIMESTAMP=$(date +%Y%m%d_%H%M%S)

# Validate inputs
if [ -z "$DATA_FILE" ] || [ -z "$TABLE_NAME" ]; then
  echo "Usage: $0 <data_file> <table_name>"
  exit 1
fi

BASE_NAME=$(basename "$DATA_FILE" .csv)
CONTROL_FILE="${BASE_NAME}.ctl"
LOG_FILE="${BASE_NAME}_${TIMESTAMP}.log"
BAD_FILE="${BASE_NAME}_${TIMESTAMP}.bad"
DISCARD_FILE="${BASE_NAME}_${TIMESTAMP}.dis"

# Count input records
INPUT_COUNT=$(wc -l < "$DATA_FILE")
echo "Input records: $INPUT_COUNT"

# Run SQL*Loader
sqlldr hr/password \
  control=$CONTROL_FILE \
  data=$DATA_FILE \
  log=$LOG_FILE \
  bad=$BAD_FILE \
  discard=$DISCARD_FILE \
  errors=1000

# Analyze results (patterns match the standard SQL*Loader log wording)
if [ -f "$LOG_FILE" ]; then
  LOADED=$(grep "Rows successfully loaded" "$LOG_FILE" | awk '{print $1}')
  REJECTED=$(grep "Total logical records rejected" "$LOG_FILE" | awk '{print $NF}')
  DISCARDED=$(grep "Total logical records discarded" "$LOG_FILE" | awk '{print $NF}')

  echo "Results:"
  echo "  Loaded:    ${LOADED:-0}"
  echo "  Rejected:  ${REJECTED:-0}"
  echo "  Discarded: ${DISCARDED:-0}"

  # Check for errors
  if [ -f "$BAD_FILE" ] && [ -s "$BAD_FILE" ]; then
    echo "Bad records found in: $BAD_FILE"
    echo "Sample bad records:"
    head -5 "$BAD_FILE"
  fi

  # Generate error report
  if [ "${REJECTED:-0}" -gt 0 ]; then
    echo "Generating error report..."
    grep "ORA-" "$LOG_FILE" > error_summary_${TIMESTAMP}.txt
  fi
fi
```
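Assuming the script is saved as load_with_tracking.sh (the file name is illustrative), it is invoked once per data file:

```bash
./load_with_tracking.sh employees.csv employees
```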
Performance Optimization
1. Memory and Buffer Optimization

```bash
#!/bin/bash
# SQL*Loader with optimal performance settings:
#   bindsize=2097152        2MB bind array
#   readsize=2097152        2MB read buffer
#   rows=10000              commit every 10K rows
#   columnarrayrows=5000    column array size
#   streamsize=256000       stream buffer size
# (Comments must not follow the backslash continuations, or the command is cut short.)

sqlldr hr/password \
  control=large_table.ctl \
  direct=true \
  parallel=true \
  bindsize=2097152 \
  readsize=2097152 \
  rows=10000 \
  columnarrayrows=5000 \
  streamsize=256000 \
  multithreading=true \
  log=optimized_load.log
```
2. System-Level Optimizations
```bash
#!/bin/bash
# Prepare system for optimal loading performance

# Database preparation
sqlplus -s / as sysdba << EOF
-- Increase SGA if needed (sga_max_size is static and takes effect after a restart)
ALTER SYSTEM SET sga_max_size=4G SCOPE=SPFILE;
ALTER SYSTEM SET sga_target=4G;

-- Optimize for bulk operations (db_writer_processes is also static)
ALTER SYSTEM SET db_writer_processes=8 SCOPE=SPFILE;
ALTER SYSTEM SET large_pool_size=512M;

-- Set tablespace to NOLOGGING
ALTER TABLESPACE users NOLOGGING;
EOF

# Disabling archive logging temporarily (if allowed) requires a restart into MOUNT mode:
#   SHUTDOWN IMMEDIATE
#   STARTUP MOUNT
#   ALTER DATABASE NOARCHIVELOG;
#   ALTER DATABASE OPEN;

# Disable constraints and indexes
sqlplus -s hr/password << EOF
-- Disable constraints
ALTER TABLE employees DISABLE CONSTRAINT pk_employees;
ALTER TABLE employees DISABLE CONSTRAINT fk_emp_dept;

-- Make indexes unusable
ALTER INDEX idx_emp_name UNUSABLE;
ALTER INDEX idx_emp_dept UNUSABLE;
EOF

# Perform the load
sqlldr hr/password control=massive_load.ctl direct=true parallel=true

# Post-load optimization
sqlplus -s hr/password << EOF
-- Rebuild indexes in parallel
ALTER INDEX idx_emp_name REBUILD PARALLEL 8 NOLOGGING;
ALTER INDEX idx_emp_dept REBUILD PARALLEL 8 NOLOGGING;

-- Re-enable constraints
ALTER TABLE employees ENABLE CONSTRAINT pk_employees;
ALTER TABLE employees ENABLE CONSTRAINT fk_emp_dept;

-- Gather statistics
EXEC DBMS_STATS.gather_table_stats('HR', 'EMPLOYEES', degree => 8);
EOF

# Re-enable logging
sqlplus -s / as sysdba << EOF
ALTER TABLESPACE users LOGGING;
EOF
# (Re-enabling ARCHIVELOG likewise requires SHUTDOWN IMMEDIATE, STARTUP MOUNT,
#  ALTER DATABASE ARCHIVELOG; and ALTER DATABASE OPEN;)
```
Complex Control File Templates
1. Multi-Format Data Loading

```
-- multi_format_loading.ctl
LOAD DATA
INFILE 'complex_data.txt'

-- Fixed-width header record
INTO TABLE file_headers
WHEN (1:3) = 'HDR'
(
  record_type   POSITION(1:3)   CHAR,
  file_id       POSITION(4:13)  INTEGER EXTERNAL,
  file_date     POSITION(14:23) DATE "YYYY-MM-DD",
  record_count  POSITION(24:33) INTEGER EXTERNAL
)

-- Delimited detail records
INTO TABLE file_details
WHEN (1:3) = 'DTL'
FIELDS TERMINATED BY '|'
(
  record_type  FILLER POSITION(1),  -- reset scanning for this INTO TABLE clause
  line_number,
  product_code,
  quantity     INTEGER EXTERNAL,
  unit_price   DECIMAL EXTERNAL,
  line_total   EXPRESSION "TO_NUMBER(:quantity) * TO_NUMBER(:unit_price)"  -- computed, no data column
)

-- Fixed trailer record
INTO TABLE file_trailers
WHEN (1:3) = 'TRL'
(
  record_type    POSITION(1:3)   CHAR,
  total_records  POSITION(4:13)  INTEGER EXTERNAL,
  total_amount   POSITION(14:25) DECIMAL EXTERNAL
)
```
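For reference, a hypothetical fragment of complex_data.txt showing all three layouts (field widths follow the POSITION clauses above; the trailer total matches the two detail lines):

```
HDR00000001232024-01-150000000002
DTL|1|WIDGET-01|10|19.99
DTL|2|GADGET-02|5|24.50
TRL0000000002000000322.40
```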
2. Data Validation and Cleansing
```
-- data_validation.ctl
LOAD DATA
INFILE 'dirty_data.csv'
BADFILE 'validation_errors.bad'
INTO TABLE clean_customers
FIELDS TERMINATED BY ','
OPTIONALLY ENCLOSED BY '"'
TRAILING NULLCOLS
(
  customer_id   "CASE WHEN REGEXP_LIKE(:customer_id, '^[0-9]+$')
                      THEN TO_NUMBER(:customer_id) ELSE NULL END",
  customer_name "CASE WHEN LENGTH(TRIM(:customer_name)) > 0
                      THEN INITCAP(TRIM(:customer_name)) ELSE NULL END",
  email         "CASE WHEN REGEXP_LIKE(LOWER(TRIM(:email)), '^[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}$')
                      THEN LOWER(TRIM(:email)) ELSE NULL END",
  phone         "REGEXP_REPLACE(REGEXP_REPLACE(:phone, '[^0-9]', ''),
                                '^1?([0-9]{3})([0-9]{3})([0-9]{4})$', '(\1) \2-\3')",
  birth_date    "CASE WHEN TO_DATE(:birth_date, 'MM/DD/YYYY') < SYSDATE
                       AND TO_DATE(:birth_date, 'MM/DD/YYYY') > DATE '1900-01-01'
                      THEN TO_DATE(:birth_date, 'MM/DD/YYYY') ELSE NULL END",
  credit_limit  "CASE WHEN TO_NUMBER(:credit_limit) >= 0
                      THEN TO_NUMBER(:credit_limit) ELSE 0 END",
  created_date  SYSDATE,
  data_source   CONSTANT 'CSV_IMPORT'
)
```
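Once loaded, a quick profile of how many values the cleansing rules nulled out helps judge source data quality. A sketch against the table above:

```sql
-- How many rows lost each field to a validation rule?
SELECT COUNT(*) AS total_rows,
       SUM(CASE WHEN customer_id IS NULL THEN 1 ELSE 0 END) AS bad_ids,
       SUM(CASE WHEN email       IS NULL THEN 1 ELSE 0 END) AS bad_emails,
       SUM(CASE WHEN birth_date  IS NULL THEN 1 ELSE 0 END) AS bad_birth_dates
FROM   clean_customers
WHERE  data_source = 'CSV_IMPORT';
```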
Monitoring and Reporting
1. Real-Time Load Monitoring

```sql
-- monitor_sqlloader.sql
-- Monitor SQL*Loader sessions in real-time

-- Current SQL*Loader sessions
SELECT s.sid, s.serial#, s.username, s.program, s.status, s.logon_time,
       st.value/1024/1024 AS mb_processed
FROM   v$session s, v$sesstat st, v$statname sn
WHERE  s.sid = st.sid
AND    st.statistic# = sn.statistic#
AND    sn.name = 'bytes received via SQL*Net from client'
AND    s.program LIKE '%sqlldr%'
ORDER BY st.value DESC;

-- I/O activity for loading sessions
SELECT s.sid, s.program, i.physical_reads, i.physical_writes, i.logical_reads
FROM   v$session s, v$sess_io i
WHERE  s.sid = i.sid
AND    s.program LIKE '%sqlldr%';

-- Current long operations
SELECT sid, opname, target, sofar, totalwork,
       ROUND(sofar/totalwork*100, 2) AS pct_complete,
       time_remaining/60 AS mins_remaining
FROM   v$session_longops
WHERE  opname LIKE '%SQL*Loader%'
AND    sofar <> totalwork;
```
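To poll these views during a long load, the script can simply be re-run on an interval. A minimal loop (the 10-second interval and SYSDBA connection are placeholders):

```bash
# Re-run the monitoring script every 10 seconds during a load;
# piping "exit" makes each sqlplus invocation terminate after the script runs
while true; do
  clear
  echo exit | sqlplus -s / as sysdba @monitor_sqlloader.sql
  sleep 10
done
```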
2. Comprehensive Load Reporting
```bash
#!/bin/bash
# Generate comprehensive loading report

LOG_FILE=$1
REPORT_FILE="load_report_$(date +%Y%m%d_%H%M%S).html"

if [ -z "$LOG_FILE" ]; then
  echo "Usage: $0 <sqlloader_log_file>"
  exit 1
fi

# Generate HTML report header
cat > $REPORT_FILE << EOF
<!DOCTYPE html>
<html>
<head>
  <title>SQL*Loader Report</title>
  <style>
    body { font-family: Arial, sans-serif; margin: 20px; }
    table { border-collapse: collapse; width: 100%; }
    th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
    th { background-color: #f2f2f2; }
    .success { color: green; }
    .error { color: red; }
    .warning { color: orange; }
  </style>
</head>
<body>
  <h1>SQL*Loader Report</h1>
  <h2>Load Summary</h2>
  <table>
EOF

# Extract key metrics from the log file (patterns match standard log wording)
TOTAL_RECORDS=$(grep "Rows successfully loaded" $LOG_FILE | awk '{print $1}')
REJECTED_RECORDS=$(grep "Total logical records rejected" $LOG_FILE | awk '{print $NF}')
DISCARDED_RECORDS=$(grep "Total logical records discarded" $LOG_FILE | awk '{print $NF}')
ELAPSED_TIME=$(grep "Elapsed time was" $LOG_FILE | cut -d: -f2-)

# Add metrics to report
cat >> $REPORT_FILE << EOF
    <tr><td>Total Records Loaded</td><td class="success">${TOTAL_RECORDS:-0}</td></tr>
    <tr><td>Records Rejected</td><td class="error">${REJECTED_RECORDS:-0}</td></tr>
    <tr><td>Records Discarded</td><td class="warning">${DISCARDED_RECORDS:-0}</td></tr>
    <tr><td>Elapsed Time</td><td>${ELAPSED_TIME:-Unknown}</td></tr>
  </table>

  <h2>Errors</h2>
  <pre>
EOF

# Add error details
grep "ORA-" $LOG_FILE >> $REPORT_FILE 2>/dev/null || echo "No errors found" >> $REPORT_FILE

cat >> $REPORT_FILE << EOF
  </pre>
</body>
</html>
EOF

echo "Report generated: $REPORT_FILE"
```
Best Practices for Advanced Loading
1. Performance Checklist

```bash
#!/bin/bash
# Verify optimal configuration for large loads

echo "SQL*Loader Performance Checklist"
echo "================================"

# Check available memory
TOTAL_MEM=$(free -g | awk '/^Mem:/{print $2}')
echo "Available Memory: ${TOTAL_MEM}GB"

# Check CPU cores
CPU_CORES=$(nproc)
echo "CPU Cores: $CPU_CORES"

# Check I/O subsystem
echo "I/O Performance Test:"
dd if=/dev/zero of=/tmp/testfile bs=1G count=1 oflag=direct 2>&1 | grep copied
rm -f /tmp/testfile

# Database checks (v\$ names are escaped so the shell does not expand them)
sqlplus -s / as sysdba << EOF
SET PAGESIZE 0 FEEDBACK OFF

-- SGA settings
SELECT 'SGA Target: ' || value/1024/1024 || 'MB' FROM v\$parameter WHERE name = 'sga_target';
SELECT 'Large Pool: ' || value/1024/1024 || 'MB' FROM v\$parameter WHERE name = 'large_pool_size';

-- Process settings
SELECT 'Processes: ' || value FROM v\$parameter WHERE name = 'processes';
SELECT 'DB Writer Processes: ' || value FROM v\$parameter WHERE name = 'db_writer_processes';

-- Archive mode
SELECT 'Archive Mode: ' || log_mode FROM v\$database;
EOF

echo "Recommendations:"
echo "- Use direct=true for tables > 1M rows"
echo "- Set parallel=true for multi-CPU systems"
echo "- Increase bindsize and readsize for large files"
echo "- Disable unnecessary constraints and indexes"
echo "- Consider NOLOGGING for temporary data"
```
This advanced guide provides sophisticated techniques for handling complex SQL*Loader scenarios in production environments.