// Solution to the challenge at:
// https://www.hackerrank.com/challenges/save-humanity/
//
// This solution improves upon the brute force SSE2 approach by
// precomputing and skipping character repetitions.
//
// By Daniel L. Taylor
// Copyright (C) 2019-2024 Taylor Design. All Rights Reserved.
// Discussed on my blog at:
// http://www.taylordesign.net/code-optimization/saving-humanity/

#include <math.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <limits.h>
#include <stdbool.h>
#include <errno.h>
#include <time.h>
#include <x86intrin.h>

#define kMaxSize 100016
#define kMaxMismatch 1
#define kCompareSize 16
#define kMinRepetitionJump 16

void fillRepetitionArray(char *s, int sc, int *sr);
int repetitionJump(int *pr, int *vr, int i);
int findDna_SSE2(char *p, int *pr, char *v, int *vr, int vc);

int main()
{
    // The code below sets files for stdin and stdout, and times the results for testing.
    // If left in it will cause the solution to fail at HackerRank.
    //char test_file_path[] = "";
    //freopen(test_file_path, "r", stdin);
    //char test_file_out[] = "";
    //freopen(test_file_out, "w", stdout);
    
    // Start the clock.
    //static double time_consumed = 0;
    //clock_t start, end;
    //start = clock();
    
    // Allocate memory for strings.
    char *p = malloc(kMaxSize * sizeof(char));
    char *v = malloc(kMaxSize * sizeof(char));
    
    // Allocate memory for repetition counts.
    int *pr = malloc(kMaxSize * sizeof(int));
    int *vr = malloc(kMaxSize * sizeof(int));
    
    // Run the test cases.
    int tc;
    scanf("%d", &tc);
    while (0 < tc--)
    {
        // Load the strings.
        scanf("%s %s", p, v);
        int pc = (int)strlen(p);
        int vc = (int)strlen(v);
        
        // Build the repetition arrays.
        fillRepetitionArray(p, pc, pr);
        fillRepetitionArray(v, vc, vr);
        
        // Look for v in p. Print the starting index of each match.
        int c = (pc-vc);
        int matched = 0;
        for (int i = 0; i <= c; i++){
            if (findDna_SSE2(&p[i], &pr[i], v, vr, vc) == 1){
                matched++;
                printf("%d ", i);
            }
        }
        
        // We have to indicate if no matches were found.
        if (matched <= 0)
            printf("No Match!\n");
        else
            printf("\n");
    }
    
    // Cleanup
    free(p);
    free(v);
    free(pr);
    free(vr);
    
    // Comment out or this will fail HackerRank.
    //end = clock();
    //time_consumed += (double)(end - start);
    //printf("%f\n", time_consumed / CLOCKS_PER_SEC);
    
    return 0;
}


void fillRepetitionArray(char *s, int sc, int *sr)
{
    // Preflight
    if (sc <= 0) return;
    if (sc == 1){
        sr[0] = 1;
        return;
    }
    
    // Prime the loop.
    char curChar = s[sc-1];
    int curCount = 1;
    sr[sc-1] = curCount;
    
    // Build the array.
    for (int i = sc-2; i >= 0; i--)
    {
        char nextChar = s[i];
        if (nextChar == curChar){
            curCount++;
        }
        else{
            curChar = nextChar;
            curCount = 1;
        }
        
        sr[i] = curCount;
    }
}

int repetitionJump(int *pr, int *vr, int i)
{
    // Get the smaller repetition.
    int v = pr[i] < vr[i] ? pr[i]:vr[i];
    if (v < kMinRepetitionJump)
        return 0;
    else
        return v - (v % kCompareSize); // Align to kCompareSize boundary.
}

int findDna_SSE2(char *p, int *pr, char *v, int *vr, int vc)
{
    // Common variables.
    int mismatch = 0;
    int c = vc - (vc % kCompareSize);
    int i = 0;
    
    for (; i < c; i += kCompareSize)
    {
        // We have to test for equality at i before checking for repetition.
        // If not equal we might as well execute the mismatch counter.
        if (p[i] != v[i])
        {
            int jc = i + kCompareSize;
            for (int j = i; j < jc; j++){
                if (p[j] != v[j]){
                    mismatch++;
                    if (mismatch > kMaxMismatch)
                        return -1;
                }
            }
        }
        else
        {
            // Can we jump ahead based on repetition?
            int jumpBy = repetitionJump(pr, vr, i);
            if (jumpBy > 0)
            {
                jumpBy -= kCompareSize; // The for loop will add kCompareSize.
                i += jumpBy; // Advance the index.
            }
            else
            {
                // Compare 16-bytes at i using SSE2 vector instructions.
                __m128i pvec = _mm_loadu_si128((void*)&p[i]);
                __m128i vvec = _mm_loadu_si128((void*)&v[i]);
                __m128i result = _mm_cmpeq_epi8(pvec, vvec);
                
                // This is a bit faster but generates a compiler error on HackerRank.
                //int matched = _mm_test_all_ones(result);
                //if (matched != 1)
                
                // This is a bit slower but compiles on HackerRank.
                long long *r = (long long *)&result;
                if (r[0] != -1 || r[1] != -1)
                {
                    // There was one or more mismatches. Count them.
                    int jc = i + kCompareSize;
                    for (int j = i; j < jc; j++){
                        if (p[j] != v[j]){
                            mismatch++;
                            if (mismatch > kMaxMismatch)
                                return -1;
                        }
                    }
                }
            }
        }
    }
    
    // Check the remaining characters beyond the kCompareSize boundary.
    for (int j = i; j < vc; j++){
        if (p[j] != v[j]){
            mismatch++;
            if (mismatch > kMaxMismatch)
                return -1;
        }
    }
    
    return 1;
}
