请教论坛的各位大大以及唐老师,QNX下一个奇怪的问题:一个函数中,多加一条语句,耗时明显变少了;少一条语句,耗时却明显增多了?

(02.24更新)基本确定问题所在:应该是编译器优化的缘故。我将多加了赋值语句的工程分别在开启与关闭编译器优化的情况下编译运行,得到的耗时差异正好与起初问题中提到的时间差异一致。谢谢诸位的关注。


唐老师以及各位论坛的大大,我现在遇到一个奇怪的问题:在一个函数中,多一条语句时的执行时间却比少一条语句时少得多:
大致情况如下:在Main函数中执行一个死循环,死循环中多次执行一个函数,当该函数中加入一条赋值语句(data = 5000000;) 时,耗时减少;反之少了那条语句,耗时却明显变多了,这个是为何呢?照理说,反过来才对哈。系统为QNX6.5 ,硬件为X86架构;
Main函数中死循环如下:

while(1)
	{
		PCIeTest(index, slot, channel);
		PCIeTest(index, slot, channel);
		PCIeTest(index, slot, channel);
		PCIeTest(index, slot, channel);
		PCIeTest(index, slot, channel);
		PCIeTest(index, slot, channel);
		PCIeTest(index, slot, channel);
		PCIeTest(index, slot, channel);
		usleep(100);
	}

**PCIeTest()**就是我上文中提到的函数,内容如下:

/*
 * Write a channel-derived value to one location in a PCIe card's mapped
 * region, read another location back, and return the scaled result.
 *
 * index   - selects the entry in the file-scope pciCard[] table
 * slot    - selects the location written: mmapAddress[0] + 4 * (slot + 9)
 * channel - determines the value written: 0xA05F + 255 * (channel - 1)
 *
 * Returns (1000000 - 61 * (readback / SHIFT16)) / 100000 as a double.
 *
 * Side effects: updates the file-scope pointers addr0 and addr4, and
 * performs one store and one load on the mapped region.
 *
 * The store and load go through volatile-qualified pointers so that an
 * optimising build cannot drop or reorder them. With plain pointers the
 * compiler may remove the load whenever its result is unused, which makes
 * timing measurements of this function meaningless.
 *
 * NOTE(review): the units of the offsets depend on the element type of
 * mmapAddress[0], which is declared elsewhere - confirm that
 * 4 * (slot + 9) and 4 * 24 are meant as element counts and not bytes.
 */
double PCIeTest(unsigned index, unsigned slot, unsigned channel)
{
	double data = 0;
	int dat;
	unsigned offset, addr;
	volatile uint32_t *wr_reg;
	volatile uint32_t *rd_reg;

	addr = 0xA05F + 255 * (channel - 1);
	offset = 4 * (slot + 9);

	/* Keep the file-scope pointers updated, as the original code did. */
	addr0 = (uint32_t *) (pciCard[index].mmapAddress[0] + offset);
	addr4 = (uint32_t *) (pciCard[index].mmapAddress[0] + 4 * 24);

	/* Volatile views: both accesses must actually happen, in this order. */
	wr_reg = (volatile uint32_t *) addr0;
	rd_reg = (volatile uint32_t *) addr4;

	*wr_reg = addr;
	dat = (*rd_reg) / SHIFT16;
	dat = 1000000 - 61 * dat;
	data = (double) dat / 100000;
//	data = 5000000;    /* the extra assignment discussed in the post */
	return data;
}

其汇编代码如下:

{
0x08048d7d <PCIeTest>:     push   %ebp
0x08048d7e <PCIeTest+1>:   mov    %esp,%ebp
0x08048d80 <PCIeTest+3>:   sub    $0x20,%esp
	double data = 0;
0x08048d83 <PCIeTest+6>:   fldz   
0x08048d85 <PCIeTest+8>:   fstpl  -0x18(%ebp)
	addr = 0xA05F + 255 * (channel - 1);
0x08048d88 <PCIeTest+11>:  mov    0x10(%ebp),%eax
0x08048d8b <PCIeTest+14>:  mov    %eax,%edx
0x08048d8d <PCIeTest+16>:  shl    $0x8,%edx
0x08048d90 <PCIeTest+19>:  mov    %edx,%ecx
0x08048d92 <PCIeTest+21>:  sub    %eax,%ecx
0x08048d94 <PCIeTest+23>:  mov    %ecx,%eax
0x08048d96 <PCIeTest+25>:  add    $0x9f60,%eax
0x08048d9b <PCIeTest+30>:  mov    %eax,-0x4(%ebp)
	offset = 4 * (slot + 9);
0x08048d9e <PCIeTest+33>:  mov    0xc(%ebp),%eax
0x08048da1 <PCIeTest+36>:  add    $0x9,%eax
0x08048da4 <PCIeTest+39>:  shl    $0x2,%eax
0x08048da7 <PCIeTest+42>:  mov    %eax,-0x8(%ebp)
	addr0 = (uint32_t *) (pciCard[index].mmapAddress[0] + offset);
0x08048daa <PCIeTest+45>:  mov    0x8(%ebp),%edx
0x08048dad <PCIeTest+48>:  mov    %edx,%eax
0x08048daf <PCIeTest+50>:  shl    $0x2,%eax
0x08048db2 <PCIeTest+53>:  add    %edx,%eax
0x08048db4 <PCIeTest+55>:  shl    $0x4,%eax
0x08048db7 <PCIeTest+58>:  mov    0x804afb4(%eax),%eax
0x08048dbd <PCIeTest+64>:  mov    -0x8(%ebp),%edx
0x08048dc0 <PCIeTest+67>:  shl    $0x2,%edx
0x08048dc3 <PCIeTest+70>:  add    %edx,%eax
0x08048dc5 <PCIeTest+72>:  mov    %eax,0x804af50
	addr4 = (uint32_t *) (pciCard[index].mmapAddress[0] + 4 * 24);
0x08048dca <PCIeTest+77>:  mov    0x8(%ebp),%edx
0x08048dcd <PCIeTest+80>:  mov    %edx,%eax
0x08048dcf <PCIeTest+82>:  shl    $0x2,%eax
0x08048dd2 <PCIeTest+85>:  add    %edx,%eax
0x08048dd4 <PCIeTest+87>:  shl    $0x4,%eax
0x08048dd7 <PCIeTest+90>:  mov    0x804afb4(%eax),%eax
0x08048ddd <PCIeTest+96>:  add    $0x180,%eax
0x08048de2 <PCIeTest+101>: mov    %eax,0x804af54
	*addr0 = addr;
0x08048de7 <PCIeTest+106>: mov    0x804af50,%eax
0x08048dec <PCIeTest+111>: mov    -0x4(%ebp),%edx
0x08048def <PCIeTest+114>: mov    %edx,(%eax)
	dat= (*addr4) / SHIFT16;
0x08048df1 <PCIeTest+116>: mov    0x804af54,%eax
0x08048df6 <PCIeTest+121>: mov    (%eax),%eax
0x08048df8 <PCIeTest+123>: shr    $0x10,%eax
0x08048dfb <PCIeTest+126>: mov    %eax,-0xc(%ebp)
	dat = 1000000 - 61*dat;
0x08048dfe <PCIeTest+129>: mov    -0xc(%ebp),%eax
0x08048e01 <PCIeTest+132>: imul   $0xffffffc3,%eax,%eax
0x08048e04 <PCIeTest+135>: add    $0xf4240,%eax
0x08048e09 <PCIeTest+140>: mov    %eax,-0xc(%ebp)
	data = (double)dat/100000;
0x08048e0c <PCIeTest+143>: fildl  -0xc(%ebp)
0x08048e0f <PCIeTest+146>: fldl   0x8049858
0x08048e15 <PCIeTest+152>: fdivrp %st,%st(1)
0x08048e17 <PCIeTest+154>: fstpl  -0x18(%ebp)
	return data;
0x08048e1a <PCIeTest+157>: fldl   -0x18(%ebp)
}
0x08048e1d <PCIeTest+160>: leave  
0x08048e1e <PCIeTest+161>: ret

当加一条赋值**data = 5000000;**语句时,汇编代码如下:

{
0x08048d7d <PCIeTest>:     push   %ebp
0x08048d7e <PCIeTest+1>:   mov    %esp,%ebp
0x08048d80 <PCIeTest+3>:   sub    $0x20,%esp
	double data = 0;
0x08048d83 <PCIeTest+6>:   fldz   
0x08048d85 <PCIeTest+8>:   fstpl  -0x18(%ebp)
	addr = 0xA05F + 255 * (channel - 1);
0x08048d88 <PCIeTest+11>:  mov    0x10(%ebp),%eax
0x08048d8b <PCIeTest+14>:  mov    %eax,%edx
0x08048d8d <PCIeTest+16>:  shl    $0x8,%edx
0x08048d90 <PCIeTest+19>:  mov    %edx,%ecx
0x08048d92 <PCIeTest+21>:  sub    %eax,%ecx
0x08048d94 <PCIeTest+23>:  mov    %ecx,%eax
0x08048d96 <PCIeTest+25>:  add    $0x9f60,%eax
0x08048d9b <PCIeTest+30>:  mov    %eax,-0x4(%ebp)
	offset = 4 * (slot + 9);
0x08048d9e <PCIeTest+33>:  mov    0xc(%ebp),%eax
0x08048da1 <PCIeTest+36>:  add    $0x9,%eax
0x08048da4 <PCIeTest+39>:  shl    $0x2,%eax
0x08048da7 <PCIeTest+42>:  mov    %eax,-0x8(%ebp)
	addr0 = (uint32_t *) (pciCard[index].mmapAddress[0] + offset);
0x08048daa <PCIeTest+45>:  mov    0x8(%ebp),%edx
0x08048dad <PCIeTest+48>:  mov    %edx,%eax
0x08048daf <PCIeTest+50>:  shl    $0x2,%eax
0x08048db2 <PCIeTest+53>:  add    %edx,%eax
0x08048db4 <PCIeTest+55>:  shl    $0x4,%eax
0x08048db7 <PCIeTest+58>:  mov    0x804afb4(%eax),%eax
0x08048dbd <PCIeTest+64>:  mov    -0x8(%ebp),%edx
0x08048dc0 <PCIeTest+67>:  shl    $0x2,%edx
0x08048dc3 <PCIeTest+70>:  add    %edx,%eax
0x08048dc5 <PCIeTest+72>:  mov    %eax,0x804af50
	addr4 = (uint32_t *) (pciCard[index].mmapAddress[0] + 4 * 24);
0x08048dca <PCIeTest+77>:  mov    0x8(%ebp),%edx
0x08048dcd <PCIeTest+80>:  mov    %edx,%eax
0x08048dcf <PCIeTest+82>:  shl    $0x2,%eax
0x08048dd2 <PCIeTest+85>:  add    %edx,%eax
0x08048dd4 <PCIeTest+87>:  shl    $0x4,%eax
0x08048dd7 <PCIeTest+90>:  mov    0x804afb4(%eax),%eax
0x08048ddd <PCIeTest+96>:  add    $0x180,%eax
0x08048de2 <PCIeTest+101>: mov    %eax,0x804af54
	*addr0 = addr;
0x08048de7 <PCIeTest+106>: mov    0x804af50,%eax
0x08048dec <PCIeTest+111>: mov    -0x4(%ebp),%edx
0x08048def <PCIeTest+114>: mov    %edx,(%eax)
	dat= (*addr4) / SHIFT16;
0x08048df1 <PCIeTest+116>: mov    0x804af54,%eax
0x08048df6 <PCIeTest+121>: mov    (%eax),%eax
0x08048df8 <PCIeTest+123>: shr    $0x10,%eax
0x08048dfb <PCIeTest+126>: mov    %eax,-0xc(%ebp)
	dat = 1000000 - 61*dat;
0x08048dfe <PCIeTest+129>: mov    -0xc(%ebp),%eax
0x08048e01 <PCIeTest+132>: imul   $0xffffffc3,%eax,%eax
0x08048e04 <PCIeTest+135>: add    $0xf4240,%eax
0x08048e09 <PCIeTest+140>: mov    %eax,-0xc(%ebp)
	data = (double)dat/100000;
0x08048e0c <PCIeTest+143>: fildl  -0xc(%ebp)
0x08048e0f <PCIeTest+146>: fldl   0x8049858
0x08048e15 <PCIeTest+152>: fdivrp %st,%st(1)
0x08048e17 <PCIeTest+154>: fstpl  -0x18(%ebp)
	data = 5000000;
0x08048e1a <PCIeTest+157>: fldl   0x8049860
0x08048e20 <PCIeTest+163>: fstpl  -0x18(%ebp)
	return data;
0x08048e23 <PCIeTest+166>: fldl   -0x18(%ebp)
}
0x08048e26 <PCIeTest+169>: leave  
0x08048e27 <PCIeTest+170>: ret

可是用QNX Momentic System Profiler测试的结果却是非常之诡异:
当多一条赋值(data = 5000000;)时,耗时大概2.471us;而少这条语句时,耗时却明显多得多**(15.164us)**。
问题:这个貌似很不合理哈,怎么多一句,耗时变少了;少一句,耗时却变多了呢?照理说,反过来才对吧。请问各位大大,这个是什么情况呢,真的很费解。
1.1 、下图1 为少一条赋值语句时的汇编截图:

图1 少赋值语句data = 5000000;时的汇编代码
1.2 、下图2 为少一条赋值语句时的System Profiler截图:

图2 少赋值语句data = 5000000;时的System Profiler
2.1 、下图3 为多一条赋值语句时的汇编截图:

图3 多赋值语句data = 5000000;时的汇编代码
2.2 、下图4 为多一条赋值语句时的System Profiler截图:

图4 多赋值语句data = 5000000;时的System Profiler

无论从代码量来说还是占用CPU周期来说,都是语句越多,耗时越长!可是我这个情况恰恰相反;

猜测一下
有可能是系统调度引起的问题

虽然汇编看起来是变长了,但是如果系统没有调度,一直顺着走完了,耗时反而会比较短

有时候虽然代码短,但是中间系统跑去干其他事情了,时间就长了

不太可能啊。这个线程一直占着CPU不放呢,期间没有别的事件发生哈。